diff options
Diffstat (limited to 'kernel')
239 files changed, 19222 insertions, 10679 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 12c679f769c6..72aa080f91f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -59,15 +59,13 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o -obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o -obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o -obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUPS) += cgroup/ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/acct.c b/kernel/acct.c index 74963d192c5d..5b1284370367 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -56,6 +56,8 @@ #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/uaccess.h> +#include <linux/sched/cputime.h> + #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> @@ -453,8 +455,8 @@ static void fill_ac(acct_t *ac) spin_lock_irq(¤t->sighand->siglock); tty = current->signal->tty; /* Safe as we hold the siglock */ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime)); + ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime)); ac->ac_flag = pacct->ac_flag; ac->ac_mem = encode_comp_t(pacct->ac_mem); ac->ac_minflt = encode_comp_t(pacct->ac_minflt); @@ -530,7 +532,7 @@ out: void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; - cputime_t utime, stime; + u64 utime, stime; unsigned long vsize = 0; if (group_dead && current->mm) { @@ -559,6 +561,7 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; + task_cputime(current, &utime, &stime); pacct->ac_utime += utime; pacct->ac_stime += stime; diff --git a/kernel/audit.c b/kernel/audit.c index 6e399bb69d7c..4b7d49868ce1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,12 @@ #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> +#include <linux/spinlock.h> +#include <linux/rcupdate.h> +#include <linux/mutex.h> +#include <linux/gfp.h> +#include <linux/pid.h> +#include <linux/slab.h> #include <linux/audit.h> @@ -90,13 +96,35 @@ static u32 audit_default; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; -/* - * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_portid contains - * the portid to use to send netlink messages to that process. +/* private audit network namespace index */ +static unsigned int audit_net_id; + +/** + * struct audit_net - audit private network namespace data + * @sk: communication socket + */ +struct audit_net { + struct sock *sk; +}; + +/** + * struct auditd_connection - kernel/auditd connection state + * @pid: auditd PID + * @portid: netlink portid + * @net: the associated network namespace + * @rcu: RCU head + * + * Description: + * This struct is RCU protected; you must either hold the RCU lock for reading + * or the associated spinlock for writing. */ -int audit_pid; -static __u32 audit_nlk_portid; +static struct auditd_connection { + struct pid *pid; + u32 portid; + struct net *net; + struct rcu_head rcu; +} *auditd_conn = NULL; +static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -121,21 +149,12 @@ u32 audit_sig_sid = 0; 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ -static atomic_t audit_lost = ATOMIC_INIT(0); - -/* The netlink socket. */ -static struct sock *audit_sock; -static unsigned int audit_net_id; +static atomic_t audit_lost = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; -/* The audit_freelist is a list of pre-allocated audit buffers (if more - * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of - * being placed on the freelist). */ -static DEFINE_SPINLOCK(audit_freelist_lock); -static int audit_freelist_count; -static LIST_HEAD(audit_freelist); +static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; @@ -170,17 +189,12 @@ DEFINE_MUTEX(audit_cmd_mutex); * should be at least that large. */ #define AUDIT_BUFSIZ 1024 -/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the - * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ -#define AUDIT_MAXFREE (2*NR_CPUS) - /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ gfp_t gfp_mask; @@ -192,12 +206,65 @@ struct audit_reply { struct sk_buff *skb; }; -static void audit_set_portid(struct audit_buffer *ab, __u32 portid) +/** + * auditd_test_task - Check to see if a given task is an audit daemon + * @task: the task to check + * + * Description: + * Return 1 if the task is a registered audit daemon, 0 otherwise. + */ +int auditd_test_task(struct task_struct *task) { - if (ab) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_pid = portid; - } + int rc; + struct auditd_connection *ac; + + rcu_read_lock(); + ac = rcu_dereference(auditd_conn); + rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); + rcu_read_unlock(); + + return rc; +} + +/** + * auditd_pid_vnr - Return the auditd PID relative to the namespace + * + * Description: + * Returns the PID in relation to the namespace, 0 on failure. + */ +static pid_t auditd_pid_vnr(void) +{ + pid_t pid; + const struct auditd_connection *ac; + + rcu_read_lock(); + ac = rcu_dereference(auditd_conn); + if (!ac || !ac->pid) + pid = 0; + else + pid = pid_vnr(ac->pid); + rcu_read_unlock(); + + return pid; +} + +/** + * audit_get_sk - Return the audit socket for the given network namespace + * @net: the destination network namespace + * + * Description: + * Returns the sock pointer if valid, NULL otherwise. The caller must ensure + * that a reference is held for the network namespace while the sock is in use. + */ +static struct sock *audit_get_sk(const struct net *net) +{ + struct audit_net *aunet; + + if (!net) + return NULL; + + aunet = net_generic(net, audit_net_id); + return aunet->sk; } void audit_panic(const char *message) @@ -210,9 +277,7 @@ void audit_panic(const char *message) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: - /* test audit_pid since printk is always losey, why bother? */ - if (audit_pid) - panic("audit: %s\n", message); + panic("audit: %s\n", message); break; } } @@ -370,21 +435,89 @@ static int audit_set_failure(u32 state) return audit_do_config_change("audit_failure", &audit_failure, state); } -/* - * For one reason or another this nlh isn't getting delivered to the userspace - * audit daemon, just send it to printk. +/** + * auditd_conn_free - RCU helper to release an auditd connection struct + * @rcu: RCU head + * + * Description: + * Drop any references inside the auditd connection tracking struct and free + * the memory. + */ + static void auditd_conn_free(struct rcu_head *rcu) + { + struct auditd_connection *ac; + + ac = container_of(rcu, struct auditd_connection, rcu); + put_pid(ac->pid); + put_net(ac->net); + kfree(ac); + } + +/** + * auditd_set - Set/Reset the auditd connection state + * @pid: auditd PID + * @portid: auditd netlink portid + * @net: auditd network namespace pointer + * + * Description: + * This function will obtain and drop network namespace references as + * necessary. Returns zero on success, negative values on failure. + */ +static int auditd_set(struct pid *pid, u32 portid, struct net *net) +{ + unsigned long flags; + struct auditd_connection *ac_old, *ac_new; + + if (!pid || !net) + return -EINVAL; + + ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); + if (!ac_new) + return -ENOMEM; + ac_new->pid = get_pid(pid); + ac_new->portid = portid; + ac_new->net = get_net(net); + + spin_lock_irqsave(&auditd_conn_lock, flags); + ac_old = rcu_dereference_protected(auditd_conn, + lockdep_is_held(&auditd_conn_lock)); + rcu_assign_pointer(auditd_conn, ac_new); + spin_unlock_irqrestore(&auditd_conn_lock, flags); + + if (ac_old) + call_rcu(&ac_old->rcu, auditd_conn_free); + + return 0; +} + +/** + * kauditd_print_skb - Print the audit record to the ring buffer + * @skb: audit record + * + * Whatever the reason, this packet may not make it to the auditd connection + * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); - if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) - pr_notice("type=%d %s\n", nlh->nlmsg_type, data); - else - audit_log_lost("printk limit exceeded"); - } + if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) + pr_notice("type=%d %s\n", nlh->nlmsg_type, data); +} + +/** + * kauditd_rehold_skb - Handle a audit record send failure in the hold queue + * @skb: audit record + * + * Description: + * This should only be used by the kauditd_thread when it fails to flush the + * hold queue. + */ +static void kauditd_rehold_skb(struct sk_buff *skb) +{ + /* put the record back in the queue at the same place */ + skb_queue_head(&audit_hold_queue, skb); } /** @@ -444,65 +577,170 @@ static void kauditd_retry_skb(struct sk_buff *skb) * auditd_reset - Disconnect the auditd connection * * Description: - * Break the auditd/kauditd connection and move all the records in the retry - * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex - * must be held when calling this function. + * Break the auditd/kauditd connection and move all the queued records into the + * hold queue in case auditd reconnects. */ static void auditd_reset(void) { + unsigned long flags; struct sk_buff *skb; + struct auditd_connection *ac_old; - /* break the connection */ - if (audit_sock) { - sock_put(audit_sock); - audit_sock = NULL; - } - audit_pid = 0; - audit_nlk_portid = 0; + /* if it isn't already broken, break the connection */ + spin_lock_irqsave(&auditd_conn_lock, flags); + ac_old = rcu_dereference_protected(auditd_conn, + lockdep_is_held(&auditd_conn_lock)); + rcu_assign_pointer(auditd_conn, NULL); + spin_unlock_irqrestore(&auditd_conn_lock, flags); - /* flush all of the retry queue to the hold queue */ + if (ac_old) + call_rcu(&ac_old->rcu, auditd_conn_free); + + /* flush all of the main and retry queues to the hold queue */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb); + while ((skb = skb_dequeue(&audit_queue))) + kauditd_hold_skb(skb); } /** - * kauditd_send_unicast_skb - Send a record via unicast to auditd + * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record + * + * Description: + * Send a skb to the audit daemon, returns positive/zero values on success and + * negative values on failure; in all cases the skb will be consumed by this + * function. If the send results in -ECONNREFUSED the connection with auditd + * will be reset. This function may sleep so callers should not hold any locks + * where this would cause a problem. */ -static int kauditd_send_unicast_skb(struct sk_buff *skb) +static int auditd_send_unicast_skb(struct sk_buff *skb) { int rc; + u32 portid; + struct net *net; + struct sock *sk; + struct auditd_connection *ac; + + /* NOTE: we can't call netlink_unicast while in the RCU section so + * take a reference to the network namespace and grab local + * copies of the namespace, the sock, and the portid; the + * namespace and sock aren't going to go away while we hold a + * reference and if the portid does become invalid after the RCU + * section netlink_unicast() should safely return an error */ + + rcu_read_lock(); + ac = rcu_dereference(auditd_conn); + if (!ac) { + rcu_read_unlock(); + rc = -ECONNREFUSED; + goto err; + } + net = get_net(ac->net); + sk = audit_get_sk(net); + portid = ac->portid; + rcu_read_unlock(); - /* if we know nothing is connected, don't even try the netlink call */ - if (!audit_pid) - return -ECONNREFUSED; + rc = netlink_unicast(sk, skb, portid, 0); + put_net(net); + if (rc < 0) + goto err; - /* get an extra skb reference in case we fail to send */ - skb_get(skb); - rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); - if (rc >= 0) { - consume_skb(skb); - rc = 0; - } + return rc; +err: + if (rc == -ECONNREFUSED) + auditd_reset(); return rc; } +/** + * kauditd_send_queue - Helper for kauditd_thread to flush skb queues + * @sk: the sending sock + * @portid: the netlink destination + * @queue: the skb queue to process + * @retry_limit: limit on number of netlink unicast failures + * @skb_hook: per-skb hook for additional processing + * @err_hook: hook called if the skb fails the netlink unicast send + * + * Description: + * Run through the given queue and attempt to send the audit records to auditd, + * returns zero on success, negative values on failure. It is up to the caller + * to ensure that the @sk is valid for the duration of this function. + * + */ +static int kauditd_send_queue(struct sock *sk, u32 portid, + struct sk_buff_head *queue, + unsigned int retry_limit, + void (*skb_hook)(struct sk_buff *skb), + void (*err_hook)(struct sk_buff *skb)) +{ + int rc = 0; + struct sk_buff *skb; + static unsigned int failed = 0; + + /* NOTE: kauditd_thread takes care of all our locking, we just use + * the netlink info passed to us (e.g. sk and portid) */ + + while ((skb = skb_dequeue(queue))) { + /* call the skb_hook for each skb we touch */ + if (skb_hook) + (*skb_hook)(skb); + + /* can we send to anyone via unicast? */ + if (!sk) { + if (err_hook) + (*err_hook)(skb); + continue; + } + + /* grab an extra skb reference in case of error */ + skb_get(skb); + rc = netlink_unicast(sk, skb, portid, 0); + if (rc < 0) { + /* fatal failure for our queue flush attempt? */ + if (++failed >= retry_limit || + rc == -ECONNREFUSED || rc == -EPERM) { + /* yes - error processing for the queue */ + sk = NULL; + if (err_hook) + (*err_hook)(skb); + if (!skb_hook) + goto out; + /* keep processing with the skb_hook */ + continue; + } else + /* no - requeue to preserve ordering */ + skb_queue_head(queue, skb); + } else { + /* it worked - drop the extra reference and continue */ + consume_skb(skb); + failed = 0; + } + } + +out: + return (rc >= 0 ? 0 : rc); +} + /* * kauditd_send_multicast_skb - Send a record to any multicast listeners * @skb: audit record * * Description: - * This function doesn't consume an skb as might be expected since it has to - * copy it anyways. + * Write a multicast message to anyone listening in the initial network + * namespace. This function doesn't consume an skb as might be expected since + * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; - struct audit_net *aunet = net_generic(&init_net, audit_net_id); - struct sock *sock = aunet->nlsk; + struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; + /* NOTE: we are not taking an additional reference for init_net since + * we don't have to worry about it going away */ + if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; @@ -526,149 +764,80 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) } /** - * kauditd_wake_condition - Return true when it is time to wake kauditd_thread - * - * Description: - * This function is for use by the wait_event_freezable() call in - * kauditd_thread(). + * kauditd_thread - Worker thread to send audit records to userspace + * @dummy: unused */ -static int kauditd_wake_condition(void) -{ - static int pid_last = 0; - int rc; - int pid = audit_pid; - - /* wake on new messages or a change in the connected auditd */ - rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last); - if (rc) - pid_last = pid; - - return rc; -} - static int kauditd_thread(void *dummy) { int rc; - int auditd = 0; - int reschedule = 0; - struct sk_buff *skb; - struct nlmsghdr *nlh; + u32 portid = 0; + struct net *net = NULL; + struct sock *sk = NULL; + struct auditd_connection *ac; #define UNICAST_RETRIES 5 -#define AUDITD_BAD(x,y) \ - ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES) - - /* NOTE: we do invalidate the auditd connection flag on any sending - * errors, but we only "restore" the connection flag at specific places - * in the loop in order to help ensure proper ordering of audit - * records */ set_freezable(); while (!kthread_should_stop()) { - /* NOTE: possible area for future improvement is to look at - * the hold and retry queues, since only this thread - * has access to these queues we might be able to do - * our own queuing and skip some/all of the locking */ - - /* NOTE: it might be a fun experiment to split the hold and - * retry queue handling to another thread, but the - * synchronization issues and other overhead might kill - * any performance gains */ + /* NOTE: see the lock comments in auditd_send_unicast_skb() */ + rcu_read_lock(); + ac = rcu_dereference(auditd_conn); + if (!ac) { + rcu_read_unlock(); + goto main_queue; + } + net = get_net(ac->net); + sk = audit_get_sk(net); + portid = ac->portid; + rcu_read_unlock(); /* attempt to flush the hold queue */ - while (auditd && (skb = skb_dequeue(&audit_hold_queue))) { - rc = kauditd_send_unicast_skb(skb); - if (rc) { - /* requeue to the same spot */ - skb_queue_head(&audit_hold_queue, skb); - - auditd = 0; - if (AUDITD_BAD(rc, reschedule)) { - mutex_lock(&audit_cmd_mutex); - auditd_reset(); - mutex_unlock(&audit_cmd_mutex); - reschedule = 0; - } - } else - /* we were able to send successfully */ - reschedule = 0; + rc = kauditd_send_queue(sk, portid, + &audit_hold_queue, UNICAST_RETRIES, + NULL, kauditd_rehold_skb); + if (rc < 0) { + sk = NULL; + auditd_reset(); + goto main_queue; } /* attempt to flush the retry queue */ - while (auditd && (skb = skb_dequeue(&audit_retry_queue))) { - rc = kauditd_send_unicast_skb(skb); - if (rc) { - auditd = 0; - if (AUDITD_BAD(rc, reschedule)) { - kauditd_hold_skb(skb); - mutex_lock(&audit_cmd_mutex); - auditd_reset(); - mutex_unlock(&audit_cmd_mutex); - reschedule = 0; - } else - /* temporary problem (we hope), queue - * to the same spot and retry */ - skb_queue_head(&audit_retry_queue, skb); - } else - /* we were able to send successfully */ - reschedule = 0; + rc = kauditd_send_queue(sk, portid, + &audit_retry_queue, UNICAST_RETRIES, + NULL, kauditd_hold_skb); + if (rc < 0) { + sk = NULL; + auditd_reset(); + goto main_queue; } - /* standard queue processing, try to be as quick as possible */ -quick_loop: - skb = skb_dequeue(&audit_queue); - if (skb) { - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* attempt to send to any multicast listeners */ - kauditd_send_multicast_skb(skb); - - /* attempt to send to auditd, queue on failure */ - if (auditd) { - rc = kauditd_send_unicast_skb(skb); - if (rc) { - auditd = 0; - if (AUDITD_BAD(rc, reschedule)) { - mutex_lock(&audit_cmd_mutex); - auditd_reset(); - mutex_unlock(&audit_cmd_mutex); - reschedule = 0; - } - - /* move to the retry queue */ - kauditd_retry_skb(skb); - } else - /* everything is working so go fast! */ - goto quick_loop; - } else if (reschedule) - /* we are currently having problems, move to - * the retry queue */ - kauditd_retry_skb(skb); - else - /* dump the message via printk and hold it */ - kauditd_hold_skb(skb); - } else { - /* we have flushed the backlog so wake everyone */ - wake_up(&audit_backlog_wait); - - /* if everything is okay with auditd (if present), go - * to sleep until there is something new in the queue - * or we have a change in the connected auditd; - * otherwise simply reschedule to give things a chance - * to recover */ - if (reschedule) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } else - wait_event_freezable(kauditd_wait, - kauditd_wake_condition()); - - /* update the auditd connection status */ - auditd = (audit_pid ? 1 : 0); +main_queue: + /* process the main queue - do the multicast send and attempt + * unicast, dump failed record sends to the retry queue; if + * sk == NULL due to previous failures we will just do the + * multicast send and move the record to the retry queue */ + rc = kauditd_send_queue(sk, portid, &audit_queue, 1, + kauditd_send_multicast_skb, + kauditd_retry_skb); + if (sk == NULL || rc < 0) + auditd_reset(); + sk = NULL; + + /* drop our netns reference, no auditd sends past this line */ + if (net) { + put_net(net); + net = NULL; } + + /* we have processed all the queues so wake everyone */ + wake_up(&audit_backlog_wait); + + /* NOTE: we want to wake up if there is anything on the queue, + * regardless of if an auditd is connected, as we need to + * do the multicast send and rotate records from the + * main queue to the retry/hold queues */ + wait_event_freezable(kauditd_wait, + (skb_queue_len(&audit_queue) ? 1 : 0)); } return 0; @@ -678,23 +847,22 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = dest->net; - struct audit_net *aunet = net_generic(net, audit_net_id); + struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ mutex_lock(&audit_cmd_mutex); mutex_unlock(&audit_cmd_mutex); while ((skb = __skb_dequeue(&dest->q)) != NULL) - netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + netlink_unicast(sk, skb, dest->portid, 0); - put_net(net); + put_net(dest->net); kfree(dest); return 0; } -struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, +struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; @@ -707,7 +875,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, if (!skb) return NULL; - nlh = nlmsg_put(skb, portid, seq, t, size, flags); + nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); @@ -722,16 +890,15 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = reply->net; - struct audit_net *aunet = net_generic(net, audit_net_id); + struct sock *sk = audit_get_sk(reply->net); mutex_lock(&audit_cmd_mutex); mutex_unlock(&audit_cmd_mutex); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); - put_net(net); + netlink_unicast(sk, reply->skb, reply->portid, 0); + put_net(reply->net); kfree(reply); return 0; } @@ -752,7 +919,6 @@ static int audit_send_reply_thread(void *arg) static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - u32 portid = NETLINK_CB(request_skb).portid; struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; @@ -762,12 +928,12 @@ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int if (!reply) return; - skb = audit_make_reply(portid, seq, type, done, multi, payload, size); + skb = audit_make_reply(seq, type, done, multi, payload, size); if (!skb) goto out; reply->net = get_net(net); - reply->portid = portid; + reply->portid = NETLINK_CB(request_skb).portid; reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); @@ -947,14 +1113,16 @@ static int audit_set_feature(struct sk_buff *skb) return 0; } -static int audit_replace(pid_t pid) +static int audit_replace(struct pid *pid) { - struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, - &pid, sizeof(pid)); + pid_t pvnr; + struct sk_buff *skb; + pvnr = pid_vnr(pid); + skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; - return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); + return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -981,7 +1149,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; - s.pid = audit_pid; + /* NOTE: use pid_vnr() so the PID is relative to the current + * namespace */ + s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); @@ -1007,37 +1177,61 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { - /* NOTE: we are using task_tgid_vnr() below because - * the s.pid value is relative to the namespace - * of the caller; at present this doesn't matter - * much since you can really only run auditd - * from the initial pid namespace, but something - * to keep in mind if this changes */ - int new_pid = s.pid; - pid_t requesting_pid = task_tgid_vnr(current); - - if ((!new_pid) && (requesting_pid != audit_pid)) { - audit_log_config_change("audit_pid", new_pid, audit_pid, 0); + /* NOTE: we are using the vnr PID functions below + * because the s.pid value is relative to the + * namespace of the caller; at present this + * doesn't matter much since you can really only + * run auditd from the initial pid namespace, but + * something to keep in mind if this changes */ + pid_t new_pid = s.pid; + pid_t auditd_pid; + struct pid *req_pid = task_tgid(current); + + /* sanity check - PID values must match */ + if (new_pid != pid_vnr(req_pid)) + return -EINVAL; + + /* test the auditd connection */ + audit_replace(req_pid); + + auditd_pid = auditd_pid_vnr(); + /* only the current auditd can unregister itself */ + if ((!new_pid) && (new_pid != auditd_pid)) { + audit_log_config_change("audit_pid", new_pid, + auditd_pid, 0); return -EACCES; } - if (audit_pid && new_pid && - audit_replace(requesting_pid) != -ECONNREFUSED) { - audit_log_config_change("audit_pid", new_pid, audit_pid, 0); + /* replacing a healthy auditd is not allowed */ + if (auditd_pid && new_pid) { + audit_log_config_change("audit_pid", new_pid, + auditd_pid, 0); return -EEXIST; } - if (audit_enabled != AUDIT_OFF) - audit_log_config_change("audit_pid", new_pid, audit_pid, 1); + if (new_pid) { - if (audit_sock) - sock_put(audit_sock); - audit_pid = new_pid; - audit_nlk_portid = NETLINK_CB(skb).portid; - sock_hold(skb->sk); - audit_sock = skb->sk; + /* register a new auditd connection */ + err = auditd_set(req_pid, + NETLINK_CB(skb).portid, + sock_net(NETLINK_CB(skb).sk)); + if (audit_enabled != AUDIT_OFF) + audit_log_config_change("audit_pid", + new_pid, + auditd_pid, + err ? 0 : 1); + if (err) + return err; + + /* try to process any backlog */ + wake_up_interruptible(&kauditd_wait); } else { + if (audit_enabled != AUDIT_OFF) + audit_log_config_change("audit_pid", + new_pid, + auditd_pid, 1); + + /* unregister the auditd connection */ auditd_reset(); } - wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); @@ -1058,6 +1252,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) return err; } + if (s.mask == AUDIT_STATUS_LOST) { + u32 lost = atomic_xchg(&audit_lost, 0); + + audit_log_config_change("lost", 0, lost, 1); + return lost; + } break; } case AUDIT_GET_FEATURE: @@ -1084,7 +1284,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) break; } - mutex_unlock(&audit_cmd_mutex); audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.*s'", @@ -1100,9 +1299,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) size--; audit_log_n_untrustedstring(ab, data, size); } - audit_set_portid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); - mutex_lock(&audit_cmd_mutex); } break; case AUDIT_ADD_RULE: @@ -1115,8 +1312,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1237,11 +1433,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err < 0 ? err : 0; } -/* - * Get message from skb. Each message is processed by audit_receive_msg. - * Malformed skbs with wrong length are discarded silently. +/** + * audit_receive - receive messages from a netlink control socket + * @skb: the message buffer + * + * Parse the provided skb and deal with any messages that may be present, + * malformed skbs are discarded. */ -static void audit_receive_skb(struct sk_buff *skb) +static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; /* @@ -1254,21 +1453,15 @@ static void audit_receive_skb(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; + mutex_lock(&audit_cmd_mutex); while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } -} - -/* Receive messages from netlink socket. */ -static void audit_receive(struct sk_buff *skb) -{ - mutex_lock(&audit_cmd_mutex); - audit_receive_skb(skb); mutex_unlock(&audit_cmd_mutex); } @@ -1292,26 +1485,27 @@ static int __net_init audit_net_init(struct net *net) struct audit_net *aunet = net_generic(net, audit_net_id); - aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); - if (aunet->nlsk == NULL) { + aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); + if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } - aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); - struct sock *sock = aunet->nlsk; - mutex_lock(&audit_cmd_mutex); - if (sock == audit_sock) - auditd_reset(); - mutex_unlock(&audit_cmd_mutex); - netlink_kernel_release(sock); - aunet->nlsk = NULL; + /* NOTE: you would think that we would want to check the auditd + * connection and potentially reset it here if it lives in this + * namespace, but since the auditd connection tracking struct holds a + * reference to this namespace (see auditd_set()) we are only ever + * going to get here after that connection has been released */ + + netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { @@ -1329,27 +1523,34 @@ static int __init audit_init(void) if (audit_initialized == AUDIT_DISABLED) return 0; - pr_info("initializing netlink subsys (%s)\n", - audit_default ? "enabled" : "disabled"); - register_pernet_subsys(&audit_net_ops); + audit_buffer_cache = kmem_cache_create("audit_buffer", + sizeof(struct audit_buffer), + 0, SLAB_PANIC, NULL); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); - audit_initialized = AUDIT_INITIALIZED; - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + pr_info("initializing netlink subsys (%s)\n", + audit_default ? "enabled" : "disabled"); + register_pernet_subsys(&audit_net_ops); + + audit_initialized = AUDIT_INITIALIZED; + audit_enabled = audit_default; + audit_ever_enabled |= !!audit_default; + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); panic("audit: failed to start the kauditd thread (%d)\n", err); } - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, + "state=initialized audit_enabled=%u res=1", + audit_enabled); return 0; } @@ -1391,60 +1592,33 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { - unsigned long flags; - if (!ab) return; kfree_skb(ab->skb); - spin_lock_irqsave(&audit_freelist_lock, flags); - if (audit_freelist_count > AUDIT_MAXFREE) - kfree(ab); - else { - audit_freelist_count++; - list_add(&ab->list, &audit_freelist); - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); + kmem_cache_free(audit_buffer_cache, ab); } -static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, - gfp_t gfp_mask, int type) +static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, + gfp_t gfp_mask, int type) { - unsigned long flags; - struct audit_buffer *ab = NULL; - struct nlmsghdr *nlh; - - spin_lock_irqsave(&audit_freelist_lock, flags); - if (!list_empty(&audit_freelist)) { - ab = list_entry(audit_freelist.next, - struct audit_buffer, list); - list_del(&ab->list); - --audit_freelist_count; - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); - - if (!ab) { - ab = kmalloc(sizeof(*ab), gfp_mask); - if (!ab) - goto err; - } + struct audit_buffer *ab; - ab->ctx = ctx; - ab->gfp_mask = gfp_mask; + ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); + if (!ab) + return NULL; ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; - nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); - if (!nlh) - goto out_kfree_skb; + ab->ctx = ctx; + ab->gfp_mask = gfp_mask; return ab; -out_kfree_skb: - kfree_skb(ab->skb); - ab->skb = NULL; err: audit_buffer_free(ab); return NULL; @@ -1475,10 +1649,10 @@ unsigned int audit_serial(void) } static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - *t = CURRENT_TIME; + ktime_get_real_ts64(t); *serial = audit_serial(); } } @@ -1502,7 +1676,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct timespec t; + struct timespec64 t; unsigned int uninitialized_var(serial); if (audit_initialized != AUDIT_INITIALIZED) @@ -1511,20 +1685,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - /* don't ever fail/sleep on these two conditions: + /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace - * 2. audit command message - record types 1000 through 1099 inclusive - * are command messages/records used to manage the kernel subsystem - * and the audit userspace, blocking on these messages could cause - * problems under load so don't do it (note: not all of these - * command types are valid as record types, but it is quicker to - * just check two ints than a series of ints in a if/switch stmt) */ - if (!((audit_pid && audit_pid == task_tgid_vnr(current)) || - (type >= 1000 && type <= 1099))) { - long sleep_time = audit_backlog_wait_time; + * 2. generator holding the audit_cmd_mutex - we don't want to block + * while holding the mutex */ + if (!(auditd_test_task(current) || + (current == __mutex_owner(&audit_cmd_mutex)))) { + long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { @@ -1533,14 +1703,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ - if ((gfp_mask & __GFP_DIRECT_RECLAIM) && - (sleep_time > 0)) { + if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - sleep_time = schedule_timeout(sleep_time); + stime = schedule_timeout(stime); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) @@ -1560,8 +1729,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_log_format(ab, "audit(%lu.%03lu:%u): ", - t.tv_sec, t.tv_nsec/1000000, serial); + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } @@ -2119,15 +2288,27 @@ out: */ void audit_log_end(struct audit_buffer *ab) { + struct sk_buff *skb; + struct nlmsghdr *nlh; + if (!ab) return; - if (!audit_rate_check()) { - audit_log_lost("rate limit exceeded"); - } else { - skb_queue_tail(&audit_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); + + if (audit_rate_check()) { + skb = ab->skb; ab->skb = NULL; - } + + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet and poke the kauditd thread */ + skb_queue_tail(&audit_queue, skb); + wake_up_interruptible(&kauditd_wait); + } else + audit_log_lost("rate limit exceeded"); + audit_buffer_free(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 960d49c9db5e..ddfce2ea4891 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -112,7 +112,7 @@ struct audit_context { enum audit_state state, current_state; unsigned int serial; /* serial number for record */ int major; /* syscall number */ - struct timespec ctime; /* time of syscall entry */ + struct timespec64 ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ long return_code;/* syscall return code */ u64 prio; @@ -199,6 +199,9 @@ struct audit_context { struct { int argc; } execve; + struct { + char *name; + } module; }; int fds[2]; struct audit_proctitle proctitle; @@ -215,7 +218,7 @@ extern void audit_log_name(struct audit_context *context, struct audit_names *n, const struct path *path, int record_num, int *call_panic); -extern int audit_pid; +extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -234,8 +237,7 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); extern int audit_compare_dname_path(const char *dname, const char *path, int plen); -extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, - int done, int multi, +extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); @@ -247,10 +249,6 @@ struct audit_netlink_list { int audit_send_list(void *); -struct audit_net { - struct sock *nlsk; -}; - extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; @@ -334,14 +332,7 @@ extern u32 audit_sig_sid; extern int audit_filter(int msgtype, unsigned int listtype); #ifdef CONFIG_AUDITSYSCALL -extern int __audit_signal_info(int sig, struct task_struct *t); -static inline int audit_signal_info(int sig, struct task_struct *t) -{ - if (unlikely((audit_pid && t->tgid == audit_pid) || - (audit_signals && !audit_dummy_context()))) - return __audit_signal_info(sig, t); - return 0; -} +extern int audit_signal_info(int sig, struct task_struct *t); extern void audit_filter_inodes(struct task_struct *, struct audit_context *); extern struct list_head *audit_killed_trees(void); #else diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 7ea57e516029..52f368b6561e 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -103,15 +103,15 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa goto out; } - fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group); audit_mark->mark.mask = AUDIT_FS_EVENTS; audit_mark->path = pathname; audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true); if (ret < 0) { - audit_fsnotify_mark_free(audit_mark); + fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } out: @@ -168,7 +168,8 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie) + const unsigned char *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct audit_fsnotify_mark *audit_mark; const struct inode *inode = NULL; @@ -187,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, default: BUG(); return 0; - }; + } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) @@ -201,6 +202,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_mark_fsnotify_ops = { .handle_event = audit_mark_handle_event, + .free_mark = audit_fsnotify_free_mark, }; static int __init audit_fsnotify_init(void) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 7b44195da81b..011d46e5f73f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -3,13 +3,14 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/kthread.h> +#include <linux/refcount.h> #include <linux/slab.h> struct audit_tree; struct audit_chunk; struct audit_tree { - atomic_t count; + refcount_t count; int goner; struct audit_chunk *root; struct list_head chunks; @@ -77,7 +78,7 @@ static struct audit_tree *alloc_tree(const char *s) tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); if (tree) { - atomic_set(&tree->count, 1); + refcount_set(&tree->count, 1); tree->goner = 0; INIT_LIST_HEAD(&tree->chunks); INIT_LIST_HEAD(&tree->rules); @@ -91,12 +92,12 @@ static struct audit_tree *alloc_tree(const char *s) static inline void get_tree(struct audit_tree *tree) { - atomic_inc(&tree->count); + refcount_inc(&tree->count); } static inline void put_tree(struct audit_tree *tree) { - if (atomic_dec_and_test(&tree->count)) + if (refcount_dec_and_test(&tree->count)) kfree_rcu(tree, head); } @@ -154,7 +155,7 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; } - fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + fsnotify_init_mark(&chunk->mark, audit_tree_group); chunk->mark.mask = FS_IN_IGNORED; return chunk; } @@ -163,33 +164,54 @@ enum {HASH_SIZE = 128}; static struct list_head chunk_hash_heads[HASH_SIZE]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); -static inline struct list_head *chunk_hash(const struct inode *inode) +/* Function to return search key in our hash from inode. */ +static unsigned long inode_to_key(const struct inode *inode) { - unsigned long n = (unsigned long)inode / L1_CACHE_BYTES; + return (unsigned long)inode; +} + +/* + * Function to return search key in our hash from chunk. Key 0 is special and + * should never be present in the hash. + */ +static unsigned long chunk_to_key(struct audit_chunk *chunk) +{ + /* + * We have a reference to the mark so it should be attached to a + * connector. + */ + if (WARN_ON_ONCE(!chunk->mark.connector)) + return 0; + return (unsigned long)chunk->mark.connector->inode; +} + +static inline struct list_head *chunk_hash(unsigned long key) +{ + unsigned long n = key / L1_CACHE_BYTES; return chunk_hash_heads + n % HASH_SIZE; } /* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - struct fsnotify_mark *entry = &chunk->mark; + unsigned long key = chunk_to_key(chunk); struct list_head *list; - if (!entry->inode) + if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) return; - list = chunk_hash(entry->inode); + list = chunk_hash(key); list_add_rcu(&chunk->hash, list); } /* called under rcu_read_lock */ struct audit_chunk *audit_tree_lookup(const struct inode *inode) { - struct list_head *list = chunk_hash(inode); + unsigned long key = inode_to_key(inode); + struct list_head *list = chunk_hash(key); struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.inode == inode) { + if (chunk_to_key(p) == key) { atomic_long_inc(&p->refs); return p; } @@ -233,11 +255,15 @@ static void untag_chunk(struct node *p) mutex_lock(&entry->group->mark_mutex); spin_lock(&entry->lock); - if (chunk->dead || !entry->inode) { + /* + * mark_mutex protects mark from getting detached and thus also from + * mark->connector->inode getting NULL. + */ + if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); if (new) - free_chunk(new); + fsnotify_put_mark(&new->mark); goto out; } @@ -261,7 +287,7 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode, + if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; @@ -327,7 +353,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; entry = &chunk->mark; - if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { + if (fsnotify_add_mark(entry, inode, NULL, 0)) { fsnotify_put_mark(entry); return -ENOSPC; } @@ -366,7 +392,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) struct node *p; int n; - old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks, + audit_tree_group); if (!old_entry) return create_chunk(inode, tree); @@ -393,17 +420,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) mutex_lock(&old_entry->group->mark_mutex); spin_lock(&old_entry->lock); - if (!old_entry->inode) { + /* + * mark_mutex protects mark from getting detached and thus also from + * mark->connector->inode getting NULL. + */ + if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(old_entry); - free_chunk(chunk); + fsnotify_put_mark(&chunk->mark); return -ENOENT; } - if (fsnotify_add_mark_locked(chunk_entry, old_entry->group, - old_entry->inode, NULL, 1)) { + if (fsnotify_add_mark_locked(chunk_entry, + old_entry->connector->inode, NULL, 1)) { spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); @@ -588,7 +619,8 @@ int audit_remove_tree_rule(struct audit_krule *rule) static int compare_root(struct vfsmount *mnt, void *arg) { - return d_backing_inode(mnt->mnt_root) == arg; + return inode_to_key(d_backing_inode(mnt->mnt_root)) == + (unsigned long)arg; } void audit_trim_trees(void) @@ -623,9 +655,10 @@ void audit_trim_trees(void) list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ - struct inode *inode = chunk->mark.inode; node->index |= 1U<<31; - if (iterate_mounts(compare_root, inode, root_mnt)) + if (iterate_mounts(compare_root, + (void *)chunk_to_key(chunk), + root_mnt)) node->index &= ~(1U<<31); } spin_unlock(&hash_lock); @@ -958,7 +991,8 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { return 0; } @@ -979,6 +1013,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, .freeing_mark = audit_tree_freeing_mark, + .free_mark = audit_tree_destroy_watch, }; static int __init audit_tree_init(void) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f79e4658433d..62d686d96581 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -28,6 +28,7 @@ #include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/netlink.h> +#include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> @@ -46,7 +47,7 @@ */ struct audit_watch { - atomic_t count; /* reference count */ + refcount_t count; /* reference count */ dev_t dev; /* associated superblock device */ char *path; /* insertion path */ unsigned long ino; /* associated inode number */ @@ -102,7 +103,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) struct audit_parent *parent = NULL; struct fsnotify_mark *entry; - entry = fsnotify_find_inode_mark(audit_watch_group, inode); + entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group); if (entry) parent = container_of(entry, struct audit_parent, mark); @@ -111,12 +112,12 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) void audit_get_watch(struct audit_watch *watch) { - atomic_inc(&watch->count); + refcount_inc(&watch->count); } void audit_put_watch(struct audit_watch *watch) { - if (atomic_dec_and_test(&watch->count)) { + if (refcount_dec_and_test(&watch->count)) { WARN_ON(watch->parent); WARN_ON(!list_empty(&watch->rules)); kfree(watch->path); @@ -157,9 +158,9 @@ static struct audit_parent *audit_init_parent(struct path *path) INIT_LIST_HEAD(&parent->watches); - fsnotify_init_mark(&parent->mark, audit_watch_free_mark); + fsnotify_init_mark(&parent->mark, audit_watch_group); parent->mark.mask = AUDIT_FS_WATCH; - ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); + ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0); if (ret < 0) { audit_free_parent(parent); return ERR_PTR(ret); @@ -178,7 +179,7 @@ static struct audit_watch *audit_init_watch(char *path) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&watch->rules); - atomic_set(&watch->count, 1); + refcount_set(&watch->count, 1); watch->path = path; watch->dev = AUDIT_DEV_UNSET; watch->ino = AUDIT_INO_UNSET; @@ -472,7 +473,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie) + const unsigned char *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { const struct inode *inode; struct audit_parent *parent; @@ -492,7 +494,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, BUG(); inode = NULL; break; - }; + } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); @@ -506,6 +508,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, + .free_mark = audit_watch_free_mark, }; static int __init audit_watch_init(void) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 880519d6cf2a..0b0aa5854dac 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -338,7 +338,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; - }; + } switch(f->type) { default: @@ -412,7 +412,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (entry->rule.listnr != AUDIT_FILTER_EXIT) return -EINVAL; break; - }; + } return 0; } @@ -1033,7 +1033,7 @@ out: } /* List rules using struct audit_rule_data. */ -static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) +static void audit_list_rules(int seq, struct sk_buff_head *q) { struct sk_buff *skb; struct audit_krule *r; @@ -1048,15 +1048,15 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) data = audit_krule_to_data(r); if (unlikely(!data)) break; - skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, - 0, 1, data, + skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); if (skb) skb_queue_tail(q, skb); kfree(data); } } - skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); } @@ -1085,13 +1085,11 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re /** * audit_rule_change - apply all rules to the specified message type * @type: audit message type - * @portid: target port id for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data */ -int audit_rule_change(int type, __u32 portid, int seq, void *data, - size_t datasz) +int audit_rule_change(int type, int seq, void *data, size_t datasz) { int err = 0; struct audit_entry *entry; @@ -1150,7 +1148,7 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); - audit_list_rules(portid, seq, &dest->q); + audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf1fa43512c1..bb724baa7ac9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include <linux/ctype.h> #include <linux/string.h> #include <linux/uaccess.h> +#include <linux/fsnotify_backend.h> #include <uapi/linux/limits.h> #include "audit.h" @@ -762,7 +763,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, struct audit_entry *e; enum audit_state state; - if (audit_pid && tsk->tgid == audit_pid) + if (auditd_test_task(tsk)) return AUDIT_DISABLED; rcu_read_lock(); @@ -816,7 +817,7 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { struct audit_names *n; - if (audit_pid && tsk->tgid == audit_pid) + if (auditd_test_task(tsk)) return; rcu_read_lock(); @@ -1221,7 +1222,7 @@ static void show_special(struct audit_context *context, int *call_panic) context->ipc.perm_mode); } break; } - case AUDIT_MQ_OPEN: { + case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", @@ -1230,8 +1231,8 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_open.attr.mq_maxmsg, context->mq_open.attr.mq_msgsize, context->mq_open.attr.mq_curmsgs); - break; } - case AUDIT_MQ_SENDRECV: { + break; + case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " "abs_timeout_sec=%ld abs_timeout_nsec=%ld", @@ -1240,12 +1241,12 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_sendrecv.msg_prio, context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); - break; } - case AUDIT_MQ_NOTIFY: { + break; + case AUDIT_MQ_NOTIFY: audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, context->mq_notify.sigev_signo); - break; } + break; case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, @@ -1255,19 +1256,24 @@ static void show_special(struct audit_context *context, int *call_panic) attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } - case AUDIT_CAPSET: { + case AUDIT_CAPSET: audit_log_format(ab, "pid=%d", context->capset.pid); audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); - break; } - case AUDIT_MMAP: { + break; + case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); - break; } - case AUDIT_EXECVE: { + break; + case AUDIT_EXECVE: audit_log_execve_info(context, &ab); - break; } + break; + case AUDIT_KERN_MODULE: + audit_log_format(ab, "name="); + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + break; } audit_log_end(ab); } @@ -1527,7 +1533,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; context->serial = 0; - context->ctime = CURRENT_TIME; + ktime_get_real_ts64(&context->ctime); context->in_syscall = 1; context->current_state = state; context->ppid = 0; @@ -1591,7 +1597,7 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(hlist_empty(&inode->i_fsnotify_marks))) + if (likely(!inode->i_fsnotify_marks)) return; context = current->audit_context; p = context->trees; @@ -1634,7 +1640,7 @@ retry: seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d_backing_inode(d); - if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { + if (inode && unlikely(inode->i_fsnotify_marks)) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { @@ -1936,13 +1942,13 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task - * @t: timespec to store time recorded in the audit_context + * @t: timespec64 to store time recorded in the audit_context * @serial: serial value that is recorded in the audit_context * * Also sets the context as auditable. */ int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { if (!ctx->in_syscall) return 0; @@ -2244,26 +2250,27 @@ void __audit_ptrace(struct task_struct *t) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ -int __audit_signal_info(int sig, struct task_struct *t) +int audit_signal_info(int sig, struct task_struct *t) { struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; kuid_t uid = current_uid(), t_uid = task_uid(t); - if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_tgid_nr(tsk); - if (uid_valid(tsk->loginuid)) - audit_sig_uid = tsk->loginuid; - else - audit_sig_uid = uid; - security_task_getsecid(tsk, &audit_sig_sid); - } - if (!audit_signals || audit_dummy_context()) - return 0; + if (auditd_test_task(t) && + (sig == SIGTERM || sig == SIGHUP || + sig == SIGUSR1 || sig == SIGUSR2)) { + audit_sig_pid = task_tgid_nr(tsk); + if (uid_valid(tsk->loginuid)) + audit_sig_uid = tsk->loginuid; + else + audit_sig_uid = uid; + security_task_getsecid(tsk, &audit_sig_sid); } + if (!audit_signals || audit_dummy_context()) + return 0; + /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { @@ -2368,6 +2375,15 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +void __audit_log_kern_module(char *name) +{ + struct audit_context *context = current->audit_context; + + context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + strcpy(context->module.name, name); + context->type = AUDIT_KERN_MODULE; +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; @@ -2411,7 +2427,7 @@ void audit_core_dumps(long signr) if (unlikely(!ab)) return; audit_log_task(ab); - audit_log_format(ab, " sig=%ld", signr); + audit_log_format(ab, " sig=%ld res=1", signr); audit_log_end(ab); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474ac3cd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d55d95dcf49..5e00b2333c26 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016,2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -16,6 +17,8 @@ #include <linux/filter.h> #include <linux/perf_event.h> +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -113,6 +116,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * index; } +/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ +static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + u32 elem_size = round_up(map->value_size, 8); + const int ret = BPF_REG_0; + const int map_ptr = BPF_REG_1; + const int index = BPF_REG_2; + + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + + if (is_power_of_2(elem_size)) { + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); + } else { + *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); + } + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -155,7 +182,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_array *array = container_of(map, struct bpf_array, map); - u32 index = *(u32 *)key; + u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { @@ -260,21 +287,17 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } -static const struct bpf_map_ops array_ops = { +const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_gen_lookup = array_map_gen_lookup, }; -static struct bpf_map_type_list array_type __read_mostly = { - .ops = &array_ops, - .type = BPF_MAP_TYPE_ARRAY, -}; - -static const struct bpf_map_ops percpu_array_ops = { +const struct bpf_map_ops percpu_array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -283,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __read_mostly = { - .ops = &percpu_array_ops, - .type = BPF_MAP_TYPE_PERCPU_ARRAY, -}; - -static int __init register_array_map(void) -{ - bpf_register_map_type(&array_type); - bpf_register_map_type(&percpu_array_type); - return 0; -} -late_initcall(register_array_map); - static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ @@ -399,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } -static const struct bpf_map_ops prog_array_ops = { +const struct bpf_map_ops prog_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -409,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, }; -static struct bpf_map_type_list prog_array_type __read_mostly = { - .ops = &prog_array_ops, - .type = BPF_MAP_TYPE_PROG_ARRAY, -}; - -static int __init register_prog_array_map(void) -{ - bpf_register_map_type(&prog_array_type); - return 0; -} -late_initcall(register_prog_array_map); - static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { @@ -511,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } -static const struct bpf_map_ops perf_event_array_ops = { +const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -522,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __read_mostly = { - .ops = &perf_event_array_ops, - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, -}; - -static int __init register_perf_event_array_map(void) -{ - bpf_register_map_type(&perf_event_array_type); - return 0; -} -late_initcall(register_perf_event_array_map); - #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, @@ -554,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static const struct bpf_map_ops cgroup_array_ops = { +const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, @@ -563,16 +549,53 @@ static const struct bpf_map_ops cgroup_array_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; +#endif -static struct bpf_map_type_list cgroup_array_type __read_mostly = { - .ops = &cgroup_array_ops, - .type = BPF_MAP_TYPE_CGROUP_ARRAY, -}; +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; -static int __init register_cgroup_array_map(void) + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void array_of_map_free(struct bpf_map *map) { - bpf_register_map_type(&cgroup_array_type); - return 0; + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); } -late_initcall(register_cgroup_array_map); -#endif + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 89b7ef41c86b..e6ef4401a138 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,7 +13,7 @@ #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET -#define PERCPU_FREE_TARGET (16) +#define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET /* Helpers to get the local list index */ @@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - struct bpf_lru_node *node, *tmp_node, *first_node; + struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; - first_node = list_first_entry(inactive, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); @@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru, list_add(&node->list, local_pending_list(loc_l)); } -struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; @@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) return node; } -struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, - struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; @@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) bpf_common_lru_push_free(lru, node); } -void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } -void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { u32 i, pcpu_entries; int cpu; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da0f53690295..ea6033cba947 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering - * @sk: The socken sending or receiving traffic + * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received * @type: The type of program to be exectuted * @@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, prog = rcu_dereference(cgrp->bpf.effective[type]); if (prog) { unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk = skb->sk; + skb->sk = sk; __skb_push(skb, offset); ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; __skb_pull(skb, offset); + skb->sk = save_sk; } rcu_read_unlock(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 503d4211988a..dedf367f59bb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -28,6 +28,9 @@ #include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/frame.h> +#include <linux/rbtree_latch.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> #include <asm/unaligned.h> @@ -73,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -95,6 +97,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->aux = aux; fp->aux->prog = fp; + INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); @@ -102,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; u32 pages, delta; int ret; @@ -290,6 +293,202 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +static __always_inline void +bpf_get_prog_addr_region(const struct bpf_prog *prog, + unsigned long *symbol_start, + unsigned long *symbol_end) +{ + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); + unsigned long addr = (unsigned long)hdr; + + WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); + + *symbol_start = addr; + *symbol_end = addr + hdr->pages * PAGE_SIZE; +} + +static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + BUILD_BUG_ON(sizeof("bpf_prog_") + + sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + + sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); + sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + *sym = 0; +} + +static __always_inline unsigned long +bpf_get_prog_addr_start(struct latch_tree_node *n) +{ + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + return symbol_start; +} + +static __always_inline bool bpf_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); +} + +static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + if (val < symbol_start) + return -1; + if (val >= symbol_end) + return 1; + + return 0; +} + +static const struct latch_tree_ops bpf_tree_ops = { + .less = bpf_tree_less, + .comp = bpf_tree_comp, +}; + +static DEFINE_SPINLOCK(bpf_lock); +static LIST_HEAD(bpf_kallsyms); +static struct latch_tree_root bpf_tree __cacheline_aligned; + +int bpf_jit_kallsyms __read_mostly; + +static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +{ + WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); + list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); +} + +static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +{ + if (list_empty(&aux->ksym_lnode)) + return; + + latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&aux->ksym_lnode); +} + +static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) +{ + return fp->jited && !bpf_prog_was_classic(fp); +} + +static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +{ + return list_empty(&fp->aux->ksym_lnode) || + fp->aux->ksym_lnode.prev == LIST_POISON2; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ + if (!bpf_prog_kallsyms_candidate(fp) || + !capable(CAP_SYS_ADMIN)) + return; + + spin_lock_bh(&bpf_lock); + bpf_prog_ksym_node_add(fp->aux); + spin_unlock_bh(&bpf_lock); +} + +void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ + if (!bpf_prog_kallsyms_candidate(fp)) + return; + + spin_lock_bh(&bpf_lock); + bpf_prog_ksym_node_del(fp->aux); + spin_unlock_bh(&bpf_lock); +} + +static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +{ + struct latch_tree_node *n; + + if (!bpf_jit_kallsyms_enabled()) + return NULL; + + n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); + return n ? + container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + NULL; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog *prog; + char *ret = NULL; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (prog) { + bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + bpf_get_prog_name(prog, sym); + + ret = sym; + if (size) + *size = symbol_end - symbol_start; + if (off) + *off = addr - symbol_start; + } + rcu_read_unlock(); + + return ret; +} + +bool is_bpf_text_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = bpf_prog_kallsyms_find(addr) != NULL; + rcu_read_unlock(); + + return ret; +} + +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog_aux *aux; + unsigned int it = 0; + int ret = -ERANGE; + + if (!bpf_jit_kallsyms_enabled()) + return ret; + + rcu_read_lock(); + list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + if (it++ != symnum) + continue; + + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + bpf_get_prog_name(aux->prog, sym); + + *value = symbol_start; + *type = BPF_SYM_ELF_TYPE; + + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -326,6 +525,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) module_memfree(hdr); } +/* This symbol is only overridden by archs that have different + * requirements than the usual eBPF JITs, f.e. when they only + * implement cBPF JIT, do not set images read-only, etc. + */ +void __weak bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) { + struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + bpf_jit_binary_unlock_ro(hdr); + bpf_jit_binary_free(hdr); + + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); + } + + bpf_prog_unlock_free(fp); +} + int bpf_jit_harden __read_mostly; static int bpf_jit_blind_insn(const struct bpf_insn *from, @@ -436,8 +653,7 @@ out: static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); @@ -939,12 +1155,12 @@ out: LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ off = IMM; load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only + * appearing in the programs where ctx == skb + * (see may_access_skb() in the verifier). All programs + * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, + * bpf_convert_filter() saves it in BPF_R6, internal BPF + * verifier will check that BPF_R6 == ctx. * * BPF_ABS and BPF_IND are wrappers of function calls, * so they scratch BPF_R1-BPF_R5 registers, preserve @@ -1154,12 +1370,22 @@ const struct bpf_func_proto bpf_tail_call_proto = { .arg3_type = ARG_ANYTHING, }; -/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +/* Stub for JITs that only support cBPF. eBPF programs are interpreted. + * It is encouraged to implement bpf_int_jit_compile() instead, so that + * eBPF and implicitly also cBPF can get JITed! + */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } +/* Stub for JITs that support eBPF. All cBPF code gets transformed into + * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). + */ +void __weak bpf_jit_compile(struct bpf_prog *prog) +{ +} + bool __weak bpf_helper_changes_pkt_data(void *func) { return false; @@ -1173,3 +1399,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, { return -EFAULT; } + +/* All definitions of tracepoints related to BPF. */ +#define CREATE_TRACE_POINTS +#include <linux/bpf_trace.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a753bbe7df0a..004334ea13ba 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -13,11 +13,13 @@ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> +#include <linux/rculist_nulls.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { - struct hlist_head head; + struct hlist_nulls_head head; raw_spinlock_t lock; }; @@ -29,28 +31,26 @@ struct bpf_htab { struct pcpu_freelist freelist; struct bpf_lru lru; }; - void __percpu *extra_elems; + struct htab_elem *__percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; -enum extra_elem_state { - HTAB_NOT_AN_EXTRA_ELEM = 0, - HTAB_EXTRA_ELEM_FREE, - HTAB_EXTRA_ELEM_USED -}; - /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { - struct hlist_node hash_node; - struct bpf_htab *htab; - struct pcpu_freelist_node fnode; + struct hlist_nulls_node hash_node; + struct { + void *padding; + union { + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; + }; }; union { struct rcu_head rcu; - enum extra_elem_state state; struct bpf_lru_node lru_node; }; u32 hash; @@ -71,6 +71,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -82,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -122,17 +132,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, static int prealloc_init(struct bpf_htab *htab) { + u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; - htab->elems = bpf_map_area_alloc(htab->elem_size * - htab->map.max_entries); + if (!htab_is_percpu(htab) && !htab_is_lru(htab)) + num_entries += num_possible_cpus(); + + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); if (!htab->elems) return -ENOMEM; if (!htab_is_percpu(htab)) goto skip_percpu_elems; - for (i = 0; i < htab->map.max_entries; i++) { + for (i = 0; i < num_entries; i++) { u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; @@ -160,10 +173,11 @@ skip_percpu_elems: if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); else - pcpu_freelist_populate(&htab->freelist, htab->elems, - htab->elem_size, htab->map.max_entries); + pcpu_freelist_populate(&htab->freelist, + htab->elems + offsetof(struct htab_elem, fnode), + htab->elem_size, num_entries); return 0; @@ -184,16 +198,22 @@ static void prealloc_destroy(struct bpf_htab *htab) static int alloc_extra_elems(struct bpf_htab *htab) { - void __percpu *pptr; + struct htab_elem *__percpu *pptr, *l_new; + struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; for_each_possible_cpu(cpu) { - ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = - HTAB_EXTRA_ELEM_FREE; + l = pcpu_freelist_pop(&htab->freelist); + /* pop will succeed, since prealloc_init() + * preallocated extra num_possible_cpus elements + */ + l_new = container_of(l, struct htab_elem, fnode); + *per_cpu_ptr(pptr, cpu) = l_new; } htab->extra_elems = pptr; return 0; @@ -217,6 +237,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + BUILD_BUG_ON(offsetof(struct htab_elem, htab) != + offsetof(struct htab_elem, hash_node.pprev)); + BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != + offsetof(struct htab_elem, hash_node.pprev)); + if (lru && !capable(CAP_SYS_ADMIN)) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. @@ -326,29 +351,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_htab; for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); + INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ - err = alloc_extra_elems(htab); - if (err) - goto free_buckets; - } - if (prealloc) { err = prealloc_init(htab); if (err) - goto free_extra_elems; + goto free_buckets; + + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ + err = alloc_extra_elems(htab); + if (err) + goto free_prealloc; + } } return &htab->map; -free_extra_elems: - free_percpu(htab->extra_elems); +free_prealloc: + prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); free_htab: @@ -366,28 +391,56 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) return &htab->buckets[hash & (htab->n_buckets - 1)]; } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) { return &__select_bucket(htab, hash)->head; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, +/* this lookup function can only be called with bucket lock taken */ +static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size) { + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_rcu(l, head, hash_node) + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; return NULL; } -/* Called from syscall or from eBPF program */ +/* can be called without bucket lock. it will repeat the loop in + * the unlikely event when elements moved from one bucket into another + * while link list is being walked + */ +static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, + u32 hash, void *key, + u32 key_size, u32 n_buckets) +{ + struct hlist_nulls_node *n; + struct htab_elem *l; + +again: + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) + goto again; + + return NULL; +} + +/* Called from syscall or from eBPF program directly, so + * arguments have to match bpf_map_lookup_elem() exactly. + * The return value is adjusted by BPF instructions + * in htab_map_gen_lookup(). + */ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l; u32 hash, key_size; @@ -400,7 +453,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) head = select_bucket(htab, hash); - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); return l; } @@ -415,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call. + * Instead of: + * bpf_prog + * bpf_map_lookup_elem + * map->ops->map_lookup_elem + * htab_map_lookup_elem + * __htab_map_lookup_elem + * do: + * bpf_prog + * __htab_map_lookup_elem + */ +static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct htab_elem, key) + + round_up(map->key_size, 8)); + return insn - insn_buf; +} + static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -433,8 +510,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { struct bpf_htab *htab = (struct bpf_htab *)arg; - struct htab_elem *l, *tgt_l; - struct hlist_head *head; + struct htab_elem *l = NULL, *tgt_l; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; @@ -444,9 +522,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) raw_spin_lock_irqsave(&b->lock, flags); - hlist_for_each_entry_rcu(l, head, hash_node) + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); break; } @@ -459,29 +537,30 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; - int i; + int i = 0; WARN_ON_ONCE(!rcu_read_lock_held()); key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); head = select_bucket(htab, hash); /* lookup the key */ - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); - if (!l) { - i = 0; + if (!l) goto find_first_elem; - } /* key was found, get next key in the same bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), struct htab_elem, hash_node); if (next_l) { @@ -500,7 +579,7 @@ find_first_elem: head = select_bucket(htab, i); /* pick first element in the bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), struct htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */ @@ -538,12 +617,15 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (l->state == HTAB_EXTRA_ELEM_USED) { - l->state = HTAB_EXTRA_ELEM_FREE; - return; + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); } - if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + if (htab_is_prealloc(htab)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -573,43 +655,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - bool old_elem_exists) + struct htab_elem *old_elem) { u32 size = htab->map.value_size; - bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); - struct htab_elem *l_new; + bool prealloc = htab_is_prealloc(htab); + struct htab_elem *l_new, **pl_new; void __percpu *pptr; - int err = 0; if (prealloc) { - l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); - if (!l_new) - err = -E2BIG; - } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - atomic_dec(&htab->count); - err = -E2BIG; + if (old_elem) { + /* if we're updating the existing element, + * use per-cpu extra elems to avoid freelist_pop/push + */ + pl_new = this_cpu_ptr(htab->extra_elems); + l_new = *pl_new; + *pl_new = old_elem; } else { - l_new = kmalloc(htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); - } - } - - if (err) { - if (!old_elem_exists) - return ERR_PTR(err); + struct pcpu_freelist_node *l; - /* if we're updating the existing element and the hash table - * is full, use per-cpu extra elems - */ - l_new = this_cpu_ptr(htab->extra_elems); - if (l_new->state != HTAB_EXTRA_ELEM_FREE) - return ERR_PTR(-E2BIG); - l_new->state = HTAB_EXTRA_ELEM_USED; + l = pcpu_freelist_pop(&htab->freelist); + if (!l) + return ERR_PTR(-E2BIG); + l_new = container_of(l, struct htab_elem, fnode); + } } else { - l_new->state = HTAB_NOT_AN_EXTRA_ELEM; + if (atomic_inc_return(&htab->count) > htab->map.max_entries) + if (!old_elem) { + /* when map is full and update() is replacing + * old element, it's ok to allocate, since + * old element will be freed immediately. + * Otherwise return an error + */ + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } memcpy(l_new->key, key, key_size); @@ -661,7 +743,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -690,7 +772,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, goto err; l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - !!l_old); + l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -700,10 +782,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { - hlist_del_rcu(&l_old->hash_node); - free_htab_elem(htab, l_old); + hlist_nulls_del_rcu(&l_old->hash_node); + if (!htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); } ret = 0; err: @@ -716,7 +799,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -757,10 +840,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { bpf_lru_node_set_ref(&l_new->lru_node); - hlist_del_rcu(&l_old->hash_node); + hlist_nulls_del_rcu(&l_old->hash_node); } ret = 0; @@ -781,7 +864,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -815,12 +898,12 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, false); + hash, true, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); } ret = 0; err: @@ -834,7 +917,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -882,7 +965,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } else { pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } ret = 0; @@ -910,7 +993,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -930,7 +1013,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); ret = 0; } @@ -942,7 +1025,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -962,7 +1045,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); ret = 0; } @@ -977,17 +1060,17 @@ static void delete_all_elements(struct bpf_htab *htab) int i; for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); - struct hlist_node *n; + struct hlist_nulls_head *head = select_bucket(htab, i); + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_safe(l, n, head, hash_node) { - hlist_del_rcu(&l->hash_node); - if (l->state != HTAB_EXTRA_ELEM_USED) - htab_elem_free(htab, l); + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + hlist_nulls_del_rcu(&l->hash_node); + htab_elem_free(htab, l); } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1004,7 +1087,7 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) + if (!htab_is_prealloc(htab)) delete_all_elements(htab); else prealloc_destroy(htab); @@ -1014,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static const struct bpf_map_ops htab_ops = { +const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_map_lookup_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_gen_lookup = htab_map_gen_lookup, }; -static struct bpf_map_type_list htab_type __read_mostly = { - .ops = &htab_ops, - .type = BPF_MAP_TYPE_HASH, -}; - -static const struct bpf_map_ops htab_lru_ops = { +const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1037,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __read_mostly = { - .ops = &htab_lru_ops, - .type = BPF_MAP_TYPE_LRU_HASH, -}; - /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -1115,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } -static const struct bpf_map_ops htab_percpu_ops = { +const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1124,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __read_mostly = { - .ops = &htab_percpu_ops, - .type = BPF_MAP_TYPE_PERCPU_HASH, -}; - -static const struct bpf_map_ops htab_lru_percpu_ops = { +const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1138,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { - .ops = &htab_lru_percpu_ops, - .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, -}; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); + + return map; +} -static int __init register_htab_map(void) +static void fd_htab_map_free(struct bpf_map *map) { - bpf_register_map_type(&htab_type); - bpf_register_map_type(&htab_percpu_type); - bpf_register_map_type(&htab_lru_type); - bpf_register_map_type(&htab_lru_percpu_type); - return 0; + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); } -late_initcall(register_htab_map); + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 045cbe673356..3d24e238221e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0b030c9126d3..9bbd33497d3d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,7 @@ #include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/bpf_trace.h> enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); + if ((trace_bpf_obj_pin_prog_enabled() || + trace_bpf_obj_pin_map_enabled()) && !ret) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_pin_prog(raw, ufd, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_pin_map(raw, ufd, pname); + } out: putname(pname); return ret; @@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname) else goto out; - if (ret < 0) + if (ret < 0) { bpf_any_put(raw, type); + } else if (trace_bpf_obj_get_prog_enabled() || + trace_bpf_obj_get_map_enabled()) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_get_prog(raw, ret, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_get_map(raw, ret, pname); + } out: putname(pname); return ret; @@ -414,7 +429,7 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) static int bpf_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr bpf_rfiles[] = { { "" } }; + static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts opts; struct inode *inode; int ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 000000000000..39cfafd895b8 --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,515 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016,2017 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <net/ipv6.h> + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + raw_spinlock_t lock; +}; + +/* This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and one single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be places as root + * node, denoted as (R) in the example below. As there are no other node, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already + * a node with the same data and a smaller prefix (ie, a less specific one), + * node (2) will become a child of (1). In child index depends on the next bit + * that is outside of what (1) matches, and that bit is 0, so (2) will be + * child[0] of (1): + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | + * +----------------+ + * | (2) | + * | 192.168.0.0/24 | + * | value: 2 | + * | [0] [1] | + * +----------------+ + * + * The child[1] slot of (1) could be filled with another node which has bit #17 + * (the next bit after the ones that (1) matches on) set to 1. For instance, + * 192.168.128.0/24: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (2) | | (3) | + * | 192.168.0.0/24 | | 192.168.128.0/24 | + * | value: 2 | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * + * Let's add another node (4) to the game for 192.168.1.0/24. In order to place + * it, node (1) is looked at first, and because (4) of the semantics laid out + * above (bit #17 is 0), it would normally be attached to (1) as child[0]. + * However, that slot is already allocated, so a new node is needed in between. + * That node does not have a value attached to it and it will never be + * returned to users as result of a lookup. It is only there to differentiate + * the traversal further. It will get a prefix as wide as necessary to + * distinguish its two children: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (4) (I) | | (3) | + * | 192.168.0.0/23 | | 192.168.128.0/24 | + * | value: --- | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * | | + * +----------------+ +----------------+ + * | (2) | | (5) | + * | 192.168.0.0/24 | | 192.168.1.0/24 | + * | value: 2 | | value: 5 | + * | [0] [1] | | [0] [1] | + * +----------------+ +----------------+ + * + * 192.168.1.1/32 would be a child of (5) etc. + * + * An intermediate node will be turned into a 'real' node on demand. In the + * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. + * + * A fully populated trie would have a height of 32 nodes, as the trie was + * created with a prefix length of 32. + * + * The lookup starts at the root node. If the current node matches and if there + * is a child that can be used to become more specific, the trie is traversed + * downwards. The last node in the traversal that is a non-intermediate one is + * returned. + */ + +static inline int extract_bit(const u8 *data, size_t index) +{ + return !!(data[index / 8] & (1 << (7 - (index % 8)))); +} + +/** + * longest_prefix_match() - determine the longest prefix + * @trie: The trie to get internal sizes from + * @node: The node to operate on + * @key: The key to compare to @node + * + * Determine the longest prefix of @node that matches the bits in @key. + */ +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key *key) +{ + size_t prefixlen = 0; + size_t i; + + for (i = 0; i < trie->data_size; i++) { + size_t b; + + b = 8 - fls(node->data[i] ^ key->data[i]); + prefixlen += b; + + if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) + return min(node->prefixlen, key->prefixlen); + + if (b < 8) + break; + } + + return prefixlen; +} + +/* Called from syscall or from eBPF program */ +static void *trie_lookup_elem(struct bpf_map *map, void *_key) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *found = NULL; + struct bpf_lpm_trie_key *key = _key; + + /* Start walking the trie from the root node ... */ + + for (node = rcu_dereference(trie->root); node;) { + unsigned int next_bit; + size_t matchlen; + + /* Determine the longest prefix of @node that matches @key. + * If it's the maximum possible prefix for this trie, we have + * an exact match and can return it directly. + */ + matchlen = longest_prefix_match(trie, node, key); + if (matchlen == trie->max_prefixlen) { + found = node; + break; + } + + /* If the number of bits that match is smaller than the prefix + * length of @node, bail out and return the node we have seen + * last in the traversal (ie, the parent). + */ + if (matchlen < node->prefixlen) + break; + + /* Consider this node as return candidate unless it is an + * artificially added intermediate one. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + found = node; + + /* If the node match is fully satisfied, let's see if we can + * become more specific. Determine the next bit in the key and + * traverse down. + */ + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + + if (!found) + return NULL; + + return found->data + trie->data_size; +} + +static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, + const void *value) +{ + struct lpm_trie_node *node; + size_t size = sizeof(struct lpm_trie_node) + trie->data_size; + + if (value) + size += trie->map.value_size; + + node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + node->flags = 0; + + if (value) + memcpy(node->data + trie->data_size, value, + trie->map.value_size); + + return node; +} + +/* Called from syscall or from eBPF program */ +static int trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node __rcu **slot; + struct bpf_lpm_trie_key *key = _key; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Allocate and fill a new node */ + + if (trie->n_entries == trie->map.max_entries) { + ret = -ENOSPC; + goto out; + } + + new_node = lpm_trie_node_alloc(trie, value); + if (!new_node) { + ret = -ENOMEM; + goto out; + } + + trie->n_entries++; + + new_node->prefixlen = key->prefixlen; + RCU_INIT_POINTER(new_node->child[0], NULL); + RCU_INIT_POINTER(new_node->child[1], NULL); + memcpy(new_node->data, key->data, trie->data_size); + + /* Now find a slot to attach the new node. To do that, walk the tree + * from the root and match as many bits as possible for each node until + * we either find an empty slot or a slot that needs to be replaced by + * an intermediate node. + */ + slot = &trie->root; + + while ((node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen || + node->prefixlen == trie->max_prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + slot = &node->child[next_bit]; + } + + /* If the slot is empty (a free child pointer or an empty root), + * simply assign the @new_node to that slot and be done. + */ + if (!node) { + rcu_assign_pointer(*slot, new_node); + goto out; + } + + /* If the slot we picked already exists, replace it with @new_node + * which already has the correct data array set. + */ + if (node->prefixlen == matchlen) { + new_node->child[0] = node->child[0]; + new_node->child[1] = node->child[1]; + + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + trie->n_entries--; + + rcu_assign_pointer(*slot, new_node); + kfree_rcu(node, rcu); + + goto out; + } + + /* If the new node matches the prefix completely, it must be inserted + * as an ancestor. Simply insert it between @node and *@slot. + */ + if (matchlen == key->prefixlen) { + next_bit = extract_bit(node->data, matchlen); + rcu_assign_pointer(new_node->child[next_bit], node); + rcu_assign_pointer(*slot, new_node); + goto out; + } + + im_node = lpm_trie_node_alloc(trie, NULL); + if (!im_node) { + ret = -ENOMEM; + goto out; + } + + im_node->prefixlen = matchlen; + im_node->flags |= LPM_TREE_NODE_FLAG_IM; + memcpy(im_node->data, node->data, trie->data_size); + + /* Now determine which child to install in which slot */ + if (extract_bit(key->data, matchlen)) { + rcu_assign_pointer(im_node->child[0], node); + rcu_assign_pointer(im_node->child[1], new_node); + } else { + rcu_assign_pointer(im_node->child[0], new_node); + rcu_assign_pointer(im_node->child[1], node); + } + + /* Finally, assign the intermediate node to the determined spot */ + rcu_assign_pointer(*slot, im_node); + +out: + if (ret) { + if (new_node) + trie->n_entries--; + + kfree(new_node); + kfree(im_node); + } + + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; +} + +static int trie_delete_elem(struct bpf_map *map, void *key) +{ + /* TODO */ + return -ENOSYS; +} + +#define LPM_DATA_SIZE_MAX 256 +#define LPM_DATA_SIZE_MIN 1 + +#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ + sizeof(struct lpm_trie_node)) +#define LPM_VAL_SIZE_MIN 1 + +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) +#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) + +static struct bpf_map *trie_alloc(union bpf_attr *attr) +{ + struct lpm_trie *trie; + u64 cost = sizeof(*trie), cost_per_node; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || + attr->map_flags != BPF_F_NO_PREALLOC || + attr->key_size < LPM_KEY_SIZE_MIN || + attr->key_size > LPM_KEY_SIZE_MAX || + attr->value_size < LPM_VAL_SIZE_MIN || + attr->value_size > LPM_VAL_SIZE_MAX) + return ERR_PTR(-EINVAL); + + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + if (!trie) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->data_size = attr->key_size - + offsetof(struct bpf_lpm_trie_key, data); + trie->max_prefixlen = trie->data_size * 8; + + cost_per_node = sizeof(struct lpm_trie_node) + + attr->value_size + trie->data_size; + cost += (u64) attr->max_entries * cost_per_node; + if (cost >= U32_MAX - PAGE_SIZE) { + ret = -E2BIG; + goto out_err; + } + + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(trie->map.pages); + if (ret) + goto out_err; + + raw_spin_lock_init(&trie->lock); + + return &trie->map; +out_err: + kfree(trie); + return ERR_PTR(ret); +} + +static void trie_free(struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + + raw_spin_lock(&trie->lock); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent + * and start over. + */ + + for (;;) { + slot = &trie->root; + + for (;;) { + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) + goto unlock; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; + continue; + } + + if (rcu_access_pointer(node->child[1])) { + slot = &node->child[1]; + continue; + } + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); + break; + } + } + +unlock: + raw_spin_unlock(&trie->lock); +} + +static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +const struct bpf_map_ops trie_map_ops = { + .map_alloc = trie_alloc, + .map_free = trie_free, + .map_get_next_key = trie_get_next_key, + .map_lookup_elem = trie_lookup_elem, + .map_update_elem = trie_update_elem, + .map_delete_elem = trie_delete_elem, +}; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/slab.h> +#include <linux/bpf.h> + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * is a runtime binding. Doing static check alone + * in the verifier is not enough. + */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include <linux/types.h> + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be8519148c25..4dfd6f2ec2f9 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, @@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; - -static struct bpf_map_type_list stack_map_type __read_mostly = { - .ops = &stack_map_ops, - .type = BPF_MAP_TYPE_STACK_TRACE, -}; - -static int __init register_stack_map(void) -{ - bpf_register_map_type(&stack_map_type); - return 0; -} -late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bbb016adbaeb..fd2411fd6914 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -10,8 +10,10 @@ * General Public License for more details. */ #include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <linux/syscalls.h> #include <linux/slab.h> +#include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> @@ -25,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active); int sysctl_unprivileged_bpf_disabled __read_mostly; -static LIST_HEAD(bpf_map_types); +static const struct bpf_map_ops * const bpf_map_types[] = { +#define BPF_PROG_TYPE(_id, _ops) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { - struct bpf_map_type_list *tl; struct bpf_map *map; - list_for_each_entry(tl, &bpf_map_types, list_node) { - if (tl->type == attr->map_type) { - map = tl->ops->map_alloc(attr); - if (IS_ERR(map)) - return map; - map->ops = tl->ops; - map->map_type = attr->map_type; - return map; - } - } - return ERR_PTR(-EINVAL); -} + if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || + !bpf_map_types[attr->map_type]) + return ERR_PTR(-EINVAL); -/* boot time registration of different map implementations */ -void bpf_register_map_type(struct bpf_map_type_list *tl) -{ - list_add(&tl->list_node, &bpf_map_types); + map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = bpf_map_types[attr->map_type]; + map->map_type = attr->map_type; + return map; } void *bpf_map_area_alloc(size_t size) @@ -66,8 +67,7 @@ void *bpf_map_area_alloc(size_t size) return area; } - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL); } void bpf_map_area_free(void *area) @@ -213,7 +213,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -241,6 +241,7 @@ static int map_create(union bpf_attr *attr) /* failed to allocate fd */ goto free_map; + trace_bpf_map_create(map, err); return err; free_map: @@ -349,6 +350,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + err = -ENOTSUPP; } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -365,6 +369,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; + trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -434,11 +439,17 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -447,6 +458,8 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -492,6 +505,8 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_delete_elem(map, ufd, key); free_key: kfree(key); err_put: @@ -520,14 +535,18 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) - goto err_put; + if (ukey) { + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + } else { + key = NULL; + } err = -ENOMEM; next_key = kmalloc(map->key_size, GFP_USER); @@ -544,6 +563,7 @@ static int map_get_next_key(union bpf_attr *attr) if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; + trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -555,79 +575,23 @@ err_put: return err; } -static LIST_HEAD(bpf_prog_types); +static const struct bpf_verifier_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _ops) \ + [_id] = &_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - struct bpf_prog_type_list *tl; - - list_for_each_entry(tl, &bpf_prog_types, list_node) { - if (tl->type == type) { - prog->aux->ops = tl->ops; - prog->type = type; - return 0; - } - } - - return -EINVAL; -} - -void bpf_register_prog_type(struct bpf_prog_type_list *tl) -{ - list_add(&tl->list_node, &bpf_prog_types); -} - -/* fixup insn->imm field of bpf_call instructions: - * if (insn->imm == BPF_FUNC_map_lookup_elem) - * insn->imm = bpf_map_lookup_elem - __bpf_call_base; - * else if (insn->imm == BPF_FUNC_map_update_elem) - * insn->imm = bpf_map_update_elem - __bpf_call_base; - * else ... - * - * this function is called after eBPF program passed verification - */ -static void fixup_bpf_calls(struct bpf_prog *prog) -{ - const struct bpf_func_proto *fn; - int i; + if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + return -EINVAL; - for (i = 0; i < prog->len; i++) { - struct bpf_insn *insn = &prog->insnsi[i]; - - if (insn->code == (BPF_JMP | BPF_CALL)) { - /* we reach here when program has bpf_call instructions - * and it passed bpf_check(), means that - * ops->get_func_proto must have been supplied, check it - */ - BUG_ON(!prog->aux->ops->get_func_proto); - - if (insn->imm == BPF_FUNC_get_route_realm) - prog->dst_needed = 1; - if (insn->imm == BPF_FUNC_get_prandom_u32) - bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_xdp_adjust_head) - prog->xdp_adjust_head = 1; - if (insn->imm == BPF_FUNC_tail_call) { - /* mark bpf_tail_call as different opcode - * to avoid conditional branch in - * interpeter for every normal call - * and to prevent accidental JITing by - * JIT compiler that doesn't support - * bpf_tail_call yet - */ - insn->imm = 0; - insn->code |= BPF_X; - continue; - } - - fn = prog->aux->ops->get_func_proto(insn->imm); - /* all functions that have prototype and verifier allowed - * programs to call them, must be real in-kernel functions - */ - BUG_ON(!fn->func); - insn->imm = fn->func - __bpf_call_base; - } - } + prog->aux->ops = bpf_prog_types[type]; + prog->type = type; + return 0; } /* drop refcnt on maps used by eBPF program and free auxilary data */ @@ -697,8 +661,11 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) + if (atomic_dec_and_test(&prog->aux->refcnt)) { + trace_bpf_prog_put_rcu(prog); + bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -807,7 +774,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - return __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get_type); @@ -876,9 +847,6 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - /* fixup BPF_CALL->imm field */ - fixup_bpf_calls(prog); - /* eBPF program is ready to be JITed */ prog = bpf_prog_select_runtime(prog, &err); if (err < 0) @@ -889,6 +857,8 @@ static int bpf_prog_load(union bpf_attr *attr) /* failed to allocate fd */ goto free_used_maps; + bpf_prog_kallsyms_add(prog); + trace_bpf_prog_load(prog, err); return err; free_used_maps: @@ -1002,6 +972,28 @@ static int bpf_prog_detach(const union bpf_attr *attr) } #endif /* CONFIG_CGROUP_BPF */ +#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration + +static int bpf_prog_test_run(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog *prog; + int ret = -ENOTSUPP; + + if (CHECK_ATTR(BPF_PROG_TEST_RUN)) + return -EINVAL; + + prog = bpf_prog_get(attr->test.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->ops->test_run) + ret = prog->aux->ops->test_run(prog, attr, uattr); + + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1068,7 +1060,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; - #ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); @@ -1077,7 +1068,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_prog_detach(&attr); break; #endif - + case BPF_PROG_TEST_RUN: + err = bpf_prog_test_run(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdc43b899f28..c5b56c92f8e2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -33,7 +33,7 @@ * - out of bounds or malformed jumps * The second pass is all possible path descent from the 1st insn. * Since it's analyzing all pathes through the program, the length of the - * analysis is limited to 32k insn, which may be hit even if total number of + * analysis is limited to 64k insn, which may be hit even if total number of * insn is less then 4K, but there are too many branches that change stack/regs. * Number of 'branches to be analyzed' is limited to 1k * @@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 65536 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -296,7 +298,8 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_insn(struct bpf_insn *insn) +static void print_bpf_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -360,9 +363,19 @@ static void print_bpf_insn(struct bpf_insn *insn) insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM) { - verbose("(%02x) r%d = 0x%x\n", - insn->code, insn->dst_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); } else { verbose("BUG_ld_%02x\n", insn->code); return; @@ -481,6 +494,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) regs[regno].max_value = BPF_REGISTER_MAX_RANGE; } +static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, + u32 regno) +{ + mark_reg_unknown_value(regs, regno); + reset_reg_range_values(regs, regno); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -532,6 +552,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) switch (type) { case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -616,7 +637,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); return 0; } } @@ -627,7 +649,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; - if (off < 0 || off + size > map->value_size) { + if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -635,6 +657,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } +/* check read/write into an adjusted map element */ +static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, + int off, int size) +{ + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; + int err; + + /* We adjusted the register to this map value, so we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + return check_map_access(env, regno, reg->max_value + off, size); +} + #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, @@ -647,6 +714,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; + /* fallthrough */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: @@ -710,38 +778,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } -static int check_ptr_alignment(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, int off, int size) +static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, + int off, int size) { - if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", - off, size); - return -EACCES; - } else { - return 0; - } - } - - if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - /* misaligned access to packet is ok on x86,arm,arm64 */ - return 0; - if (reg->id && size != 1) { - verbose("Unknown packet alignment. Only byte-sized access allowed\n"); + verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); return -EACCES; } /* skb->data is NET_IP_ALIGN-ed */ - if (reg->type == PTR_TO_PACKET && - (NET_IP_ALIGN + reg->off + off) % size != 0) { + if ((NET_IP_ALIGN + reg->off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", NET_IP_ALIGN, reg->off, off, size); return -EACCES; } + return 0; } +static int check_val_ptr_alignment(const struct bpf_reg_state *reg, + int size) +{ + if (size != 1) { + verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); + return -EACCES; + } + + return 0; +} + +static int check_ptr_alignment(const struct bpf_reg_state *reg, + int off, int size) +{ + switch (reg->type) { + case PTR_TO_PACKET: + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : + check_pkt_ptr_alignment(reg, off, size); + case PTR_TO_MAP_VALUE_ADJ: + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : + check_val_ptr_alignment(reg, size); + default: + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", + off, size); + return -EACCES; + } + + return 0; + } +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -763,7 +849,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, if (size < 0) return size; - err = check_ptr_alignment(env, reg, off, size); + err = check_ptr_alignment(reg, off, size); if (err) return err; @@ -775,47 +861,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - /* If we adjusted the register to this map value at all then we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. - */ - if (reg->type == PTR_TO_MAP_VALUE_ADJ) { - if (log_level) - print_verifier_state(state); - env->varlen_map_value_access = true; - /* The minimum value is only important with signed - * comparisons where we can't assume the floor of a - * value is 0. If we are using signed variables for our - * index'es we need to make sure that whatever we use - * will have a set floor within our range. - */ - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; - } - err = check_map_access(env, regno, reg->min_value + off, - size); - if (err) { - verbose("R%d min value is outside of the array range\n", - regno); - return err; - } - - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. - */ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", - regno); - return -EACCES; - } - off += reg->max_value; - } - err = check_map_access(env, regno, off, size); + if (reg->type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno, off, size); + else + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = UNKNOWN_VALUE; @@ -827,7 +879,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; } @@ -860,7 +913,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -958,6 +1012,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } +static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + + switch (regs[regno].type) { + case PTR_TO_PACKET: + return check_packet_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE: + return check_map_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE_ADJ: + return check_map_access_adj(env, regno, 0, access_size); + default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_stack_boundary(env, regno, access_size, + zero_size_allowed, meta); + } +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -993,10 +1066,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_STACK; if (type != PTR_TO_PACKET && type != expected_type) goto err_type; - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { expected_type = CONST_IMM; - if (type != expected_type) + /* One exception. Allow UNKNOWN_VALUE registers when the + * boundaries are known and don't cause unsafe memory accesses + */ + if (type != UNKNOWN_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1006,8 +1082,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_STACK || - arg_type == ARG_PTR_TO_RAW_STACK) { + } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -1015,9 +1091,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (type == CONST_IMM && reg->imm == 0) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != expected_type) + else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; + meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -1063,9 +1140,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { - bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it @@ -1073,14 +1150,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + verbose("ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } - if (regs[regno - 1].type == PTR_TO_PACKET) - err = check_packet_access(env, regno - 1, 0, reg->imm); - else - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + + /* If the register is UNKNOWN_VALUE, the access check happens + * using its boundaries. Otherwise, just use its imm + */ + if (type == UNKNOWN_VALUE) { + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); + if (err) + return err; + } else { + /* register is CONST_IMM */ + err = check_helper_mem_access(env, regno - 1, reg->imm, + zero_size_allowed, meta); + } } return err; @@ -1115,6 +1228,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; default: break; } @@ -1154,15 +1271,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn) { int count = 0; - if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; return count > 1 ? -EINVAL : 0; @@ -1191,7 +1308,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; @@ -1275,6 +1392,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + struct bpf_insn_aux_data *insn_aux; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() @@ -1287,6 +1406,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id) } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; + insn_aux = &env->insn_aux_data[insn_idx]; + if (!insn_aux->map_ptr) + insn_aux->map_ptr = meta.map_ptr; + else if (insn_aux->map_ptr != meta.map_ptr) + insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -1316,7 +1440,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env, imm = insn->imm; add_imm: - if (imm <= 0) { + if (imm < 0) { verbose("addition of negative constant to packet pointer is not allowed\n"); return -EACCES; } @@ -1485,22 +1609,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); + u64 dst_imm = dst_reg->imm; - /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' - * insn. Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here. Simulate execution of insns + * containing ALU ops. Don't care about overflow or negative + * values, just add/sub/... them; registers are in u64. */ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm += insn->imm; - else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm += src_reg->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm |= insn->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm |= src_reg->imm; - else + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { + dst_imm += insn->imm; + } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm += src_reg->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { + dst_imm -= insn->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm -= src_reg->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { + dst_imm *= insn->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm *= src_reg->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { + dst_imm |= insn->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm |= src_reg->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { + dst_imm &= insn->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm &= src_reg->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm >>= insn->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm >>= src_reg->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm <<= insn->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm <<= src_reg->imm; + } else { mark_reg_unknown_value(regs, insn->dst_reg); + goto out; + } + + dst_reg->imm = dst_imm; +out: return 0; } @@ -1779,6 +1935,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } else if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == PTR_TO_STACK && + ((BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == CONST_IMM) || + BPF_SRC(insn->code) == BPF_K)) { + if (BPF_SRC(insn->code) == BPF_X) + dst_reg->imm += regs[insn->src_reg].imm; + else + dst_reg->imm += insn->imm; + return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && (dst_reg->type == PTR_TO_PACKET || (BPF_SRC(insn->code) == BPF_X && regs[insn->src_reg].type == PTR_TO_PACKET))) { @@ -1811,6 +1978,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * register as unknown. */ if (env->allow_ptr_leaks && + BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD && (dst_reg->type == PTR_TO_MAP_VALUE || dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) dst_reg->type = PTR_TO_MAP_VALUE_ADJ; @@ -1859,14 +2027,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) - regs[i].range = dst_reg->off; + /* keep the maximum range already checked */ + regs[i].range = max(regs[i].range, dst_reg->off); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) - reg->range = dst_reg->off; + reg->range = max(reg->range, dst_reg->off); } } @@ -1894,6 +2063,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. @@ -1904,6 +2074,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. @@ -1942,6 +2113,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* * If this is false, then the val is <= the register, if it is @@ -1953,6 +2125,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then constant < register, if it is true then * the register < constant. @@ -1974,14 +2147,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) { + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. */ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -2144,14 +2322,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (insn->src_reg == 0) { - /* generic move 64-bit immediate into a register, - * only analyzer needs to collect the ld_imm value. - */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - if (!env->analyzer_ops) - return 0; - regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; return 0; @@ -2664,7 +2836,7 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Proccessed %d insn\n", + verbose("BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -2692,7 +2864,7 @@ static int do_check(struct bpf_verifier_env *env) if (log_level) { verbose("%d: ", insn_idx); - print_bpf_insn(insn); + print_bpf_insn(env, insn); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); @@ -2729,7 +2901,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - reset_reg_range_values(regs, insn->dst_reg); if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; @@ -2829,7 +3000,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - err = check_call(env, insn->imm); + err = check_call(env, insn->imm, insn_idx); if (err) return err; @@ -2913,16 +3084,33 @@ process_bpf_exit: return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. + */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } @@ -3051,6 +3239,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) insn->src_reg = 0; } +/* single env->prog->insni[off] instruction was replaced with the range + * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying + * [0, off) and [off, end) to new locations, so the patched range stays zero + */ +static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, + u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + + if (cnt == 1) + return 0; + new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); + if (!new_data) + return -ENOMEM; + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); + memcpy(new_data + off + cnt - 1, old_data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + env->insn_aux_data = new_data; + vfree(old_data); + return 0; +} + +static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) +{ + struct bpf_prog *new_prog; + + new_prog = bpf_patch_insn_single(env->prog, off, patch, len); + if (!new_prog) + return NULL; + if (adjust_insn_aux_data(env, new_prog->len, off, len)) + return NULL; + return new_prog; +} + /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -3070,10 +3293,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) verbose("bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { - new_prog = bpf_patch_insn_single(env->prog, 0, - insn_buf, cnt); + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) return -ENOMEM; + env->prog = new_prog; delta += cnt - 1; } @@ -3085,27 +3308,29 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else continue; - if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; - cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, - cnt); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -3119,6 +3344,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +/* fixup insn->imm field of bpf_call instructions + * and inline eligible helpers as explicit sequence of BPF instructions + * + * this function is called after eBPF program passed verification + */ +static int fixup_bpf_calls(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + const struct bpf_func_proto *fn; + const int insn_cnt = prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + struct bpf_map *map_ptr; + int i, cnt, delta = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_tail_call) { + /* If we tail call into other programs, we + * cannot make any assumptions since they can + * be replaced dynamically during runtime in + * the program array. + */ + prog->cb_access = 1; + + /* mark bpf_tail_call as different opcode to avoid + * conditional branch in the interpeter for every normal + * call and to prevent accidental JITing by JIT compiler + * that doesn't support bpf_tail_call yet + */ + insn->imm = 0; + insn->code |= BPF_X; + continue; + } + + if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON || + !map_ptr->ops->map_gen_lookup) + goto patch_call_imm; + + cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + + /* keep walking new program and skip insns we just inserted */ + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + +patch_call_imm: + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + if (!fn->func) { + verbose("kernel subsystem misconfigured func %s#%d\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + insn->imm = fn->func - __bpf_call_base; + } + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -3214,6 +3522,9 @@ skip_full_check: /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); + if (ret == 0) + ret = fixup_bpf_calls(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile new file mode 100644 index 000000000000..387348a40c64 --- /dev/null +++ b/kernel/cgroup/Makefile @@ -0,0 +1,6 @@ +obj-y := cgroup.o namespace.o cgroup-v1.o + +obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CGROUP_RDMA) += rdma.o +obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h new file mode 100644 index 000000000000..00f4d6bf048f --- /dev/null +++ b/kernel/cgroup/cgroup-internal.h @@ -0,0 +1,215 @@ +#ifndef __CGROUP_INTERNAL_H +#define __CGROUP_INTERNAL_H + +#include <linux/cgroup.h> +#include <linux/kernfs.h> +#include <linux/workqueue.h> +#include <linux/list.h> +#include <linux/refcount.h> + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + +/* used to track tasks and csets during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +/* migration context also tracks preloading */ +struct cgroup_mgctx { + /* + * Preloaded source and destination csets. Used to guarantee + * atomic success or failure on actual migration. + */ + struct list_head preloaded_src_csets; + struct list_head preloaded_dst_csets; + + /* tasks and csets to migrate */ + struct cgroup_taskset tset; + + /* subsystems affected by migration */ + u16 ss_mask; +}; + +#define CGROUP_TASKSET_INIT(tset) \ +{ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +#define CGROUP_MGCTX_INIT(name) \ +{ \ + LIST_HEAD_INIT(name.preloaded_src_csets), \ + LIST_HEAD_INIT(name.preloaded_dst_csets), \ + CGROUP_TASKSET_INIT(name.tset), \ +} + +#define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; + +extern struct mutex cgroup_mutex; +extern spinlock_t css_set_lock; +extern struct cgroup_subsys *cgroup_subsys[]; +extern struct list_head cgroup_roots; +extern struct file_system_type cgroup_fs_type; + +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + +static inline bool notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (refcount_dec_not_one(&cset->refcount)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + refcount_inc(&cset->refcount); +} + +bool cgroup_ssid_enabled(int ssid); +bool cgroup_on_dfl(const struct cgroup *cgrp); + +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root); +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); +void cgroup_kn_unlock(struct kernfs_node *kn); +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +void cgroup_free_root(struct cgroup_root *root); +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns); + +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); +void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, + struct cgroup_mgctx *mgctx); +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); +int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_mgctx *mgctx); + +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup); +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup); +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + +void cgroup_lock_and_drain_offline(struct cgroup *cgrp); + +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); +int cgroup_rmdir(struct kernfs_node *kn); +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root); + +/* + * namespace.c + */ +extern const struct proc_ns_operations cgroupns_operations; + +/* + * cgroup-v1.c + */ +extern struct cftype cgroup1_base_files[]; +extern const struct file_operations proc_cgroupstats_operations; +extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; + +bool cgroup1_ssid_disabled(int ssid); +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); +void cgroup1_release_agent(struct work_struct *work); +void cgroup1_check_for_release(struct cgroup *cgrp); +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns); + +#endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c new file mode 100644 index 000000000000..85d75152402d --- /dev/null +++ b/kernel/cgroup/cgroup-v1.c @@ -0,0 +1,1412 @@ +#include "cgroup-internal.h" + +#include <linux/ctype.h> +#include <linux/kmod.h> +#include <linux/sort.h> +#include <linux/delay.h> +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/delayacct.h> +#include <linux/pid_namespace.h> +#include <linux/cgroupstats.h> + +#include <trace/events/cgroup.h> + +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); + +bool cgroup1_ssid_disabled(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroup_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. + */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + DEFINE_CGROUP_MGCTX(mgctx); + struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; + int ret; + + if (cgroup_on_dfl(to)) + return -EINVAL; + + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + + mutex_lock(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* all tasks in @from are being moved, all csets are source */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &mgctx); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_migrate_prepare_dst(&mgctx); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, &mgctx); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&mgctx); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} + +static void pidlist_free(void *p) +{ + kvfree(p); +} + +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) +{ + int src, dest = 1; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + */ +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. + */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. + */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += refcount_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; + return 0; +} + +/* + * seq_file methods for the tasks/procs files. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +{ + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; + + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (l->list[mid] == pid) { + index = mid; + break; + } else if (l->list[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = *iter; + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} + +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; + /* + * Advance to the next pid in the array. If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); + + return 0; +} + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; +} + +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} + +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; +} + +/* cgroup core interface files for the legacy hierarchies */ +struct cftype cgroup1_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, + { + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, + }, + { + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, + }, + { } /* terminate */ +}; + +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + struct cgroup_subsys *ss; + int i; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ + mutex_lock(&cgroup_mutex); + + for_each_subsys(ss, i) + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; + + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + + /* + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. + */ + rcu_read_lock(); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); + + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +void cgroup1_check_for_release(struct cgroup *cgrp) +{ + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +void cgroup1_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); +} + +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} + +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + continue; + } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + + continue; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup1_ssid_disabled(i)) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. + */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + + return 0; +} + +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; +} + +struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .rename = cgroup1_rename, + .show_options = cgroup1_show_options, + .remount_fs = cgroup1_remount, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns) +{ + struct super_block *pinned_sb = NULL; + struct cgroup_sb_opts opts; + struct cgroup_root *root; + struct cgroup_subsys *ss; + struct dentry *dentry; + int i, ret; + bool new_root = false; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. + */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. + */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + new_root = true; + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) + return ERR_PTR(ret); + + dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, + CGROUP_SUPER_MAGIC, ns); + + /* + * There's a race window after we release cgroup_mutex and before + * allocating a superblock. Make sure a concurrent process won't + * be able to re-use the root during this window by delaying the + * initialization of root refcnt. + */ + if (new_root) { + mutex_lock(&cgroup_mutex); + percpu_ref_reinit(&root->cgrp.self.refcnt); + mutex_unlock(&cgroup_mutex); + } + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) + deactivate_super(pinned_sb); + + return dentry; +} + +static int __init cgroup1_wq_init(void) +{ + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; +} +core_initcall(cgroup1_wq_init); + +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + + +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = refcount_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %pK\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c index 53bbca7c4859..c3c9a0e1b3c9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -28,35 +28,27 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cgroup.h> +#include "cgroup-internal.h" + #include <linux/cred.h> -#include <linux/ctype.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> -#include <linux/list.h> #include <linux/magic.h> -#include <linux/mm.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> -#include <linux/sort.h> -#include <linux/kmod.h> -#include <linux/delayacct.h> -#include <linux/cgroupstats.h> #include <linux/hashtable.h> -#include <linux/pid_namespace.h> #include <linux/idr.h> -#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> -#include <linux/delay.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> @@ -67,14 +59,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> -/* - * pidlists linger the following amount before being destroyed. The goal - * is avoiding frequent destruction in the middle of consecutive read calls - * Expiring in the middle is a performance problem not a correctness one. - * 1 sec should be enough. - */ -#define CGROUP_PIDLIST_DESTROY_DELAY HZ - #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) @@ -88,14 +72,12 @@ * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ -#ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); + +#ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); -#else -static DEFINE_MUTEX(cgroup_mutex); -static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -110,12 +92,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ -static DEFINE_SPINLOCK(release_agent_path_lock); - struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ @@ -131,15 +107,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem; */ static struct workqueue_struct *cgroup_destroy_wq; -/* - * pidlist destructions need to be flushed on cgroup destruction. Use a - * separate workqueue as flush domain. - */ -static struct workqueue_struct *cgroup_pidlist_destroy_wq; - /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, -static struct cgroup_subsys *cgroup_subsys[] = { +struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS @@ -186,18 +156,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_visible; -/* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; - /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static unsigned long cgrp_dfl_implicit_ss_mask; +static u16 cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ - -static LIST_HEAD(cgroup_roots); +LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ @@ -213,32 +179,26 @@ static DEFINE_IDR(cgroup_hierarchy_idr); static u64 css_serial_nr_next = 1; /* - * These bitmask flags indicate whether tasks in the fork and exit paths have - * fork/exit handlers to call. This avoids us having to do extra work in the - * fork/exit path to check which subsystems have fork/exit callbacks. + * These bitmasks identify subsystems with specific features to avoid + * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_free_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = { .counter = 2, }, + .count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; -/* Ditto for the can_fork callback. */ -static u16 have_canfork_callback __read_mostly; - static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_dfl_base_files[]; -static struct cftype cgroup_legacy_base_files[]; +static struct cftype cgroup_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); @@ -259,7 +219,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ -static bool cgroup_ssid_enabled(int ssid) +bool cgroup_ssid_enabled(int ssid) { if (CGROUP_SUBSYS_COUNT == 0) return false; @@ -267,11 +227,6 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } -static bool cgroup_ssid_no_v1(int ssid) -{ - return cgroup_no_v1_mask & (1 << ssid); -} - /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -325,7 +280,7 @@ static bool cgroup_ssid_no_v1(int ssid) * * - debug: disallowed on the default hierarchy. */ -static bool cgroup_on_dfl(const struct cgroup *cgrp) +bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } @@ -481,13 +436,12 @@ out_unlock: return css; } -/* convenient tests for these bits */ -static inline bool cgroup_is_dead(const struct cgroup *cgrp) +static void __maybe_unused cgroup_get(struct cgroup *cgrp) { - return !(cgrp->self.flags & CSS_ONLINE); + css_get(&cgrp->self); } -static void cgroup_get(struct cgroup *cgrp) +static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); @@ -518,11 +472,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -static int notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor @@ -553,15 +502,6 @@ static int notify_on_release(const struct cgroup *cgrp) else /** - * for_each_subsys - iterate all enabled cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - */ -#define for_each_subsys(ss, ssid) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) - -/** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end @@ -585,10 +525,6 @@ static int notify_on_release(const struct cgroup *cgrp) } \ } while (false) -/* iterate across the hierarchies */ -#define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) - /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ @@ -615,29 +551,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -static void cgroup_release_agent(struct work_struct *work); -static void check_for_release(struct cgroup *cgrp); - -/* - * A cgroup can be associated with multiple css_sets as different tasks may - * belong to different cgroups on different hierarchies. In the other - * direction, a css_set is naturally associated with multiple cgroups. - * This M:N relationship is represented by the following link structure - * which exists for each association and allows traversing the associations - * from both sides. - */ -struct cgrp_cset_link { - /* the cgroup and css_set this link associates */ - struct cgroup *cgrp; - struct css_set *cset; - - /* list of cgrp_cset_links anchored at cgrp->cset_links */ - struct list_head cset_link; - - /* list of cgrp_cset_links anchored at css_set->cgrp_links */ - struct list_head cgrp_link; -}; - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state @@ -646,13 +559,13 @@ struct cgrp_cset_link { * haven't been created. */ struct css_set init_css_set = { - .refcount = ATOMIC_INIT(1), - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), + .refcount = REFCOUNT_INIT(1), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ @@ -699,7 +612,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - check_for_release(cgrp); + cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); cgrp = cgroup_parent(cgrp); @@ -808,7 +721,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void put_css_set_locked(struct css_set *cset) +void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -816,7 +729,7 @@ static void put_css_set_locked(struct css_set *cset) lockdep_assert_held(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) + if (!refcount_dec_and_test(&cset->refcount)) return; /* This css_set is dead. unlink it and release cgroup and css refs */ @@ -838,31 +751,6 @@ static void put_css_set_locked(struct css_set *cset) kfree_rcu(cset, rcu_head); } -static void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - atomic_inc(&cset->refcount); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -1049,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) - cgroup_get(cgrp); + cgroup_get_live(cgrp); } /** @@ -1094,14 +982,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; } - atomic_set(&cset->refcount, 1); - INIT_LIST_HEAD(&cset->cgrp_links); + refcount_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); - INIT_LIST_HEAD(&cset->mg_preload_node); - INIT_LIST_HEAD(&cset->mg_node); INIT_LIST_HEAD(&cset->task_iters); INIT_HLIST_NODE(&cset->hlist); + INIT_LIST_HEAD(&cset->cgrp_links); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ @@ -1138,7 +1026,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -1166,7 +1054,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } -static void cgroup_free_root(struct cgroup_root *root) +void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); @@ -1283,8 +1171,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root) +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -1321,7 +1209,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) @@ -1415,7 +1302,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ -static void cgroup_kn_unlock(struct kernfs_node *kn) +void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -1447,8 +1334,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, - bool drain_offline) +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; @@ -1532,9 +1418,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) - cfts = cgroup_dfl_base_files; + cfts = cgroup_base_files; else - cfts = cgroup_legacy_base_files; + cfts = cgroup1_base_files; return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); } @@ -1559,7 +1445,7 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; @@ -1629,8 +1515,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } -static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; @@ -1656,237 +1542,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup_show_options(struct seq_file *seq, - struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - if (root != &cgrp_dfl_root) - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup_ssid_no_v1(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. - */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - if (root == &cgrp_dfl_root) { - pr_err("remount is not allowed\n"); - return -EINVAL; - } - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - - WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; + pr_err("remount is not allowed\n"); + return -EINVAL; } /* @@ -1964,11 +1623,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); + INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } -static void init_cgroup_root(struct cgroup_root *root, - struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; @@ -1987,10 +1645,11 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; @@ -2002,8 +1661,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, - GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, + ref_flags, GFP_KERNEL); if (ret) goto out; @@ -2022,7 +1681,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto cancel_ref; - root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + kf_sops = root == &cgrp_dfl_root ? + &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; + + root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED, root_cgrp); if (IS_ERR(root->kf_root)) { @@ -2080,182 +1742,18 @@ out: return ret; } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns) { - bool is_v2 = fs_type == &cgroup2_fs_type; - struct super_block *pinned_sb = NULL; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup_subsys *ss; - struct cgroup_root *root; - struct cgroup_sb_opts opts; struct dentry *dentry; - int ret; - int i; bool new_sb; - get_cgroup_ns(ns); - - /* Check if the caller has permission to mount. */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } - - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - - if (is_v2) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); - put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); - } - cgrp_dfl_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - goto out_mount; - } - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. - */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. - */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; - } + dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) { - put_cgroup_ns(ns); - return ERR_PTR(ret); - } -out_mount: - dentry = kernfs_mount(fs_type, flags, root->kf_root, - is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, - &new_sb); - - /* - * In non-init cgroup namespace, instead of root cgroup's - * dentry, we return the dentry corresponding to the - * cgroupns->root_cgrp. + * In non-init cgroup namespace, instead of root cgroup's dentry, + * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { struct dentry *nsdentry; @@ -2277,13 +1775,45 @@ out_mount: if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); + return dentry; +} + +static struct dentry *cgroup_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct dentry *dentry; + + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. */ - if (pinned_sb) { - WARN_ON(new_sb); - deactivate_super(pinned_sb); + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); + + if (fs_type == &cgroup2_fs_type) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_visible = true; + cgroup_get_live(&cgrp_dfl_root.cgrp); + + dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, + CGROUP2_SUPER_MAGIC, ns); + } else { + dentry = cgroup1_mount(&cgroup_fs_type, flags, data, + CGROUP_SUPER_MAGIC, ns); } put_cgroup_ns(ns); @@ -2311,7 +1841,7 @@ static void cgroup_kill_sb(struct super_block *sb) kernfs_kill_sb(sb); } -static struct file_system_type cgroup_fs_type = { +struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, @@ -2325,8 +1855,8 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); @@ -2389,49 +1919,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* used to track tasks and other necessary states during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. - * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - /** - * cgroup_taskset_add - try to add a migration target task to a taskset + * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task - * @tset: target taskset + * @mgctx: target migration context * - * Add @task, which is a migration target, to @tset. This function becomes - * noop if @task doesn't need to be migrated. @task's css_set should have - * been added as a migration source and @task->cg_list will be moved from - * the css_set's tasks list to mg_tasks one. + * Add @task, which is a migration target, to @mgctx->tset. This function + * becomes noop if @task doesn't need to be migrated. @task's css_set + * should have been added as a migration source and @task->cg_list will be + * moved from the css_set's tasks list to mg_tasks one. */ -static void cgroup_taskset_add(struct task_struct *task, - struct cgroup_taskset *tset) +static void cgroup_migrate_add_task(struct task_struct *task, + struct cgroup_mgctx *mgctx) { struct css_set *cset; @@ -2451,10 +1950,11 @@ static void cgroup_taskset_add(struct task_struct *task, list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset->src_csets); + list_add_tail(&cset->mg_node, + &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); + list_add_tail(&cset->mg_dst_cset->mg_node, + &mgctx->tset.dst_csets); } /** @@ -2521,17 +2021,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset - * @tset: taget taskset - * @root: cgroup root the migration is taking place on + * @mgctx: migration context * - * Migrate tasks in @tset as setup by migration preparation functions. + * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @tset are migrated. - * @tset is consumed regardless of success. + * guarantees that either all or none of the tasks in @mgctx are migrated. + * @mgctx is consumed regardless of success. */ -static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup_root *root) +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { + struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; @@ -2542,7 +2041,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, return 0; /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); @@ -2578,7 +2077,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); @@ -2589,7 +2088,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { @@ -2616,7 +2115,7 @@ out_release_tset: * zero for migration destination cgroups with tasks so that child cgroups * don't compete against tasks. */ -static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) { return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || !dst_cgrp->subtree_control; @@ -2624,25 +2123,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) /** * cgroup_migrate_finish - cleanup after attach - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ -static void cgroup_migrate_finish(struct list_head *preloaded_csets) +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { + LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + + list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); + list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + + list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } + spin_unlock_irq(&css_set_lock); } @@ -2650,10 +2155,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @preloaded_csets, which should later be cleaned + * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem @@ -2662,9 +2167,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ -static void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; @@ -2692,33 +2197,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add(&src_cset->mg_preload_node, preloaded_csets); + list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @preloaded_csets: list of preloaded source css_sets + * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been - * preloaded to @preloaded_csets. This function looks up and pins all - * destination css_sets, links each to its source, and append them to - * @preloaded_csets. + * preloaded to @mgctx->preloaded_src_csets. This function looks up and + * pins all destination css_sets, links each to its source, and append them + * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @preloaded_csets. + * @mgctx. */ -static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { - LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_preload_node) { struct css_set *dst_cset; + struct cgroup_subsys *ss; + int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) @@ -2743,15 +2250,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) - list_add(&dst_cset->mg_preload_node, &csets); + list_add_tail(&dst_cset->mg_preload_node, + &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); + + for_each_subsys(ss, ssid) + if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) + mgctx->ss_mask |= 1 << ssid; } - list_splice_tail(&csets, preloaded_csets); return 0; err: - cgroup_migrate_finish(&csets); + cgroup_migrate_finish(mgctx); return -ENOMEM; } @@ -2759,7 +2270,7 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @root: cgroup root migration is taking place on + * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also @@ -2773,10 +2284,9 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) +int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_mgctx *mgctx) { - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; /* @@ -2788,14 +2298,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_taskset_migrate(&tset, root); + return cgroup_migrate_execute(mgctx); } /** @@ -2806,10 +2316,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ -static int cgroup_attach_task(struct cgroup *dst_cgrp, - struct task_struct *leader, bool threadgroup) +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret; @@ -2821,8 +2331,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, rcu_read_lock(); task = leader; do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, - &preloaded_csets); + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); @@ -2830,11 +2339,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx); - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); if (!ret) trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); @@ -2846,20 +2355,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); int ret = 0; - /* - * even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - ret = -EACCES; - - if (!ret && cgroup_on_dfl(dst_cgrp)) { + if (cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; struct cgroup *cgrp; struct inode *inode; @@ -2877,9 +2375,21 @@ static int cgroup_procs_write_permission(struct task_struct *task, ret = inode_permission(inode, MAY_WRITE); iput(inode); } + } else { + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + + /* + * even if we're attaching all tasks in the thread group, + * we only need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + put_cred(tcred); } - put_cred(tcred); return ret; } @@ -2888,8 +2398,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. */ -static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; struct cgroup_subsys *ss; @@ -2920,11 +2430,12 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_NO_SETAFFINITY and become - * trapped in a cpuset, or RT worker may be born in a cgroup - * with no rt_runtime allocated. Just say no. + * kthreads may acquire PF_NO_SETAFFINITY during initialization. + * If userland migrates such a kthread to a non-root cgroup, it can + * become trapped in a cpuset, or RT kthread may be born in a + * cgroup with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { + if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out_unlock_rcu; } @@ -2950,86 +2461,12 @@ out_unlock_threadgroup: return ret ?: nbytes; } -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroup_root *root; - int retval = 0; - - mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - for_each_root(root) { - struct cgroup *from_cgrp; - - if (root == &cgrp_dfl_root) - continue; - - spin_lock_irq(&css_set_lock); - from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_irq(&css_set_lock); - - retval = cgroup_attach_task(from_cgrp, tsk, false); - if (retval) - break; - } - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, false); -} - -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) { return __cgroup_procs_write(of, buf, nbytes, off, true); } -static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); - cgroup_kn_unlock(of->kn); - return nbytes; -} - -static int cgroup_release_agent_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); - seq_putc(seq, '\n'); - return 0; -} - -static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) -{ - seq_puts(seq, "0\n"); - return 0; -} - static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; @@ -3075,8 +2512,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - LIST_HEAD(preloaded_csets); - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; @@ -3092,33 +2528,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, - &preloaded_csets); + cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { struct task_struct *task, *ntask; - /* src_csets precede dst_csets, break on the first dst_cset */ - if (!src_cset->mg_src_cgrp) - break; - /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp->root); + ret = cgroup_migrate_execute(&mgctx); out_finish: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -3131,7 +2562,7 @@ out_finish: * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) +void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; @@ -3150,7 +2581,7 @@ restart: if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; - cgroup_get(dsct); + cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); @@ -3244,7 +2675,7 @@ static bool css_visible(struct cgroup_subsys_state *css) * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for - * cleaning up with cgroup_apply_control_disble(). + * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { @@ -3503,6 +2934,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_file_open(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->open) + return cft->open(of); + return 0; +} + +static void cgroup_file_release(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->release) + cft->release(of); +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -3553,7 +3001,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - seq_cft(seq)->seq_stop(seq, v); + if (seq_cft(seq)->seq_stop) + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -3575,12 +3024,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, @@ -3588,48 +3041,6 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) -{ - struct cgroup *cgrp = kn->priv; - int ret; - - if (kernfs_type(kn) != KERNFS_DIR) - return -ENOTDIR; - if (kn->parent != new_parent) - return -EIO; - - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_mutex. - */ - kernfs_break_active_protection(new_parent); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_mutex); - - ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) - trace_cgroup_rename(cgrp); - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - kernfs_unbreak_active_protection(new_parent); - return ret; -} - /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -3926,26 +3337,6 @@ void cgroup_file_notify(struct cgroup_file *cfile) } /** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. - */ -static int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); - spin_unlock_irq(&css_set_lock); - return count; -} - -/** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk @@ -4343,560 +3734,69 @@ void css_task_iter_end(struct css_task_iter *it) put_task_struct(it->cur_task); } -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - * - * Locking rules between cgroup_post_fork() and the migration path - * guarantee that, if a task is forking while being migrated, the new child - * is guaranteed to be either visible in the source cgroup after the - * parent's migration is complete or put into the target cgroup. No task - * can slip out of migration through forking. - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - LIST_HEAD(preloaded_csets); - struct cgrp_cset_link *link; - struct css_task_iter it; - struct task_struct *task; - int ret; - - if (!cgroup_may_migrate_to(to)) - return -EBUSY; - - mutex_lock(&cgroup_mutex); - - percpu_down_write(&cgroup_threadgroup_rwsem); - - /* all tasks in @from are being moved, all csets are source */ - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_irq(&css_set_lock); - - ret = cgroup_migrate_prepare_dst(&preloaded_csets); - if (ret) - goto out_err; - - /* - * Migrate tasks one-by-one until @from is empty. This fails iff - * ->can_attach() fails. - */ - do { - css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); - if (task) - get_task_struct(task); - css_task_iter_end(&it); - - if (task) { - ret = cgroup_migrate(task, false, to->root); - if (!ret) - trace_cgroup_transfer_tasks(to, task, false); - put_task_struct(task); - } - } while (task && !ret); -out_err: - cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - return ret; -} - -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* which pidlist file are we talking about? */ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* for delayed destruction */ - struct delayed_work destroy_dwork; -}; - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. - * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - -/* - * Used to destroy all pidlists lingering waiting for destroy timer. None - * should be left afterwards. - */ -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) -{ - struct cgroup_pidlist *l, *tmp_l; - - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); - mutex_unlock(&cgrp->pidlist_mutex); - - flush_workqueue(cgroup_pidlist_destroy_wq); - BUG_ON(!list_empty(&cgrp->pidlists)); -} - -static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, - destroy_dwork); - struct cgroup_pidlist *tofree = NULL; - - mutex_lock(&l->owner->pidlist_mutex); - - /* - * Destroy iff we didn't get queued again. The state won't change - * as destroy_dwork can only be queued while locked. - */ - if (!delayed_work_pending(dwork)) { - list_del(&l->links); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - tofree = l; - } - - mutex_unlock(&l->owner->pidlist_mutex); - kfree(tofree); -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * Returns the number of unique elements. - */ -static int pidlist_uniq(pid_t *list, int length) -{ - int src, dest = 1; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - return dest; -} - -/* - * The two pid files - task and cgroup.procs - guaranteed that the result - * is sorted, which forced this whole pidlist fiasco. As pid order is - * different per namespace, each namespace needs differently sorted list, - * making it impossible to use, for example, single rbtree of member tasks - * sorted by task pointer. As pidlists can be fairly large, allocating one - * per open file is dangerous, so cgroup had to implement shared pool of - * pidlists keyed by cgroup and namespace. - * - * All this extra complexity was caused by the original implementation - * committing to an entirely unnecessary property. In the long term, we - * want to do away with it. Explicitly scramble sort order if on the - * default hierarchy so that no such expectation exists in the new - * interface. - * - * Scrambling is done by swapping every two consecutive bits, which is - * non-identity one-to-one mapping which disturbs sort order sufficiently. - */ -static pid_t pid_fry(pid_t pid) -{ - unsigned a = pid & 0x55555555; - unsigned b = pid & 0xAAAAAAAA; - - return (a << 1) | (b >> 1); -} - -static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) +static void cgroup_procs_release(struct kernfs_open_file *of) { - if (cgroup_on_dfl(cgrp)) - return pid_fry(pid); - else - return pid; -} - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -static int fried_cmppid(const void *a, const void *b) -{ - return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); -} - -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - - lockdep_assert_held(&cgrp->pidlist_mutex); - - list_for_each_entry(l, &cgrp->pidlists, links) - if (l->key.type == type && l->key.ns == ns) - return l; - return NULL; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. - */ -static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - l = cgroup_pidlist_find(cgrp, type); - if (l) - return l; - - /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) - return l; - - INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - l->key.type = type; - /* don't need task_nsproxy() if we're looking at ourself */ - l->key.ns = get_pid_ns(task_active_pid_ns(current)); - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct css_task_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - css_task_iter_end(&it); - length = n; - /* now sort & (if procs) strip out duplicates */ - if (cgroup_on_dfl(cgrp)) - sort(array, length, sizeof(pid_t), fried_cmppid, NULL); - else - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); - - l = cgroup_pidlist_find_create(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; + if (of->priv) { + css_task_iter_end(of->priv); + kfree(of->priv); } - - /* store array, freeing old if necessary */ - pidlist_free(l->list); - l->list = array; - l->length = length; - *lp = l; - return 0; } -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. - */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct cgroup *cgrp; - struct css_task_iter it; - struct task_struct *tsk; - - /* it should be kernfs_node belonging to cgroupfs and is a directory */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) - return -EINVAL; - - mutex_lock(&cgroup_mutex); - - /* - * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_online_from_dir(), - * @kn->priv is RCU safe. Let's do the RCU dancing. - */ - rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { - rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); - return -ENOENT; - } - rcu_read_unlock(); + struct kernfs_open_file *of = s->private; + struct css_task_iter *it = of->priv; + struct task_struct *task; - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - css_task_iter_end(&it); + do { + task = css_task_iter_next(it); + } while (task && !thread_group_leader(task)); - mutex_unlock(&cgroup_mutex); - return 0; + return task; } - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. - */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). Use a binary-search to find the - * next pid to display, if any - */ struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_pidlist *l; - enum cgroup_filetype type = seq_cft(s)->private; - int index = 0, pid = *pos; - int *iter, ret; - - mutex_lock(&cgrp->pidlist_mutex); + struct css_task_iter *it = of->priv; /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * When a seq_file is seeked, it's always traversed sequentially + * from position 0, so we can simply keep iterating on !0 *pos. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); - - /* - * Either this is the first start() after open or the matching - * pidlist has been destroyed inbetween. Create a new one. - */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); - if (ret) - return ERR_PTR(ret); - } - l = of->priv; - - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { - index = mid; - break; - } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = cgroup_pid_fry(cgrp, *iter); - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - - if (l) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); -} + if (!it) { + if (WARN_ON_ONCE((*pos)++)) + return ERR_PTR(-EINVAL); -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - return NULL; - } else { - *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); - return p; + it = kzalloc(sizeof(*it), GFP_KERNEL); + if (!it) + return ERR_PTR(-ENOMEM); + of->priv = it; + css_task_iter_start(&cgrp->self, it); + } else if (!(*pos)++) { + css_task_iter_end(it); + css_task_iter_start(&cgrp->self, it); } -} -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", *(int *)v); - - return 0; + return cgroup_procs_next(s, NULL, NULL); } -static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft) +static int cgroup_procs_show(struct seq_file *s, void *v) { - return notify_on_release(css->cgroup); -} - -static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - return 0; -} - -static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); -} - -static int cgroup_clone_children_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + seq_printf(s, "%d\n", task_tgid_vnr(v)); return 0; } /* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_dfl_base_files[] = { +static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", .file_offset = offsetof(struct cgroup, procs_file), - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, + .release = cgroup_procs_release, + .seq_start = cgroup_procs_start, + .seq_next = cgroup_procs_next, + .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { @@ -4917,51 +3817,6 @@ static struct cftype cgroup_dfl_base_files[] = { { } /* terminate */ }; -/* cgroup core interface files for the legacy hierarchies */ -static struct cftype cgroup_legacy_base_files[] = { - { - .name = "cgroup.procs", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { - .name = "tasks", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_release_agent_show, - .write = cgroup_release_agent_write, - .max_write_len = PATH_MAX - 1, - }, - { } /* terminate */ -}; - /* * css destruction is four-stage process. * @@ -5007,7 +3862,7 @@ static void css_free_work_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { @@ -5097,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { lockdep_assert_held(&cgroup_mutex); - cgroup_get(cgrp); + cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; @@ -5273,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); - cgroup_get(parent); + cgroup_get_live(parent); /* * @cgrp is now fully operational. If something fails after this @@ -5302,8 +4157,7 @@ out_free_cgrp: return ERR_PTR(ret); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; @@ -5507,7 +4361,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - check_for_release(cgroup_parent(cgrp)); + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -5515,7 +4369,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; -static int cgroup_rmdir(struct kernfs_node *kn) +int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; @@ -5535,10 +4389,8 @@ static int cgroup_rmdir(struct kernfs_node *kn) static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .remount_fs = cgroup_remount, - .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, - .rename = cgroup_rename, .show_path = cgroup_show_path, }; @@ -5646,8 +4498,8 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); /* * The latency of the synchronize_sched() is too high for cgroups, @@ -5666,7 +4518,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); mutex_unlock(&cgroup_mutex); @@ -5697,7 +4549,7 @@ int __init cgroup_init(void) continue; } - if (cgroup_ssid_no_v1(ssid)) + if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", ss->name); @@ -5744,15 +4596,6 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); - - /* - * Used to destroy pidlists and separate to serve as flush domain. - * Cap @max_active to 1 too. - */ - cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); - BUG_ON(!cgroup_pidlist_destroy_wq); - return 0; } core_initcall(cgroup_wq_init); @@ -5835,42 +4678,6 @@ out: return retval; } -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - struct cgroup_subsys *ss; - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. - */ - mutex_lock(&cgroup_mutex); - - for_each_subsys(ss, i) - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -static const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -6050,76 +4857,6 @@ void cgroup_free(struct task_struct *task) put_css_set(cset); } -static void check_for_release(struct cgroup *cgrp) -{ - if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && - !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. - */ -static void cgroup_release_agent(struct work_struct *work) -{ - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; - - mutex_lock(&cgroup_mutex); - - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) - goto out; - - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; - - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(agentbuf); - kfree(pathbuf); -} - static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; @@ -6141,33 +4878,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -static int __init cgroup_no_v1(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; - break; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - - cgroup_no_v1_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_no_v1=", cgroup_no_v1); - /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6197,7 +4907,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); @@ -6242,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path) if (kn) { if (kernfs_type(kn) == KERNFS_DIR) { cgrp = kn->priv; - cgroup_get(cgrp); + cgroup_get_live(cgrp); } else { cgrp = ERR_PTR(-ENOTDIR); } @@ -6322,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) /* Socket clone path */ if (skcd->val) { + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ cgroup_get(sock_cgroup_ptr(skcd)); return; } @@ -6349,154 +5064,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ -/* cgroup namespaces */ - -static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) -{ - return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); -} - -static void dec_cgroup_namespaces(struct ucounts *ucounts) -{ - dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); -} - -static struct cgroup_namespace *alloc_cgroup_ns(void) -{ - struct cgroup_namespace *new_ns; - int ret; - - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); - if (!new_ns) - return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - atomic_set(&new_ns->count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; -} - -void free_cgroup_ns(struct cgroup_namespace *ns) -{ - put_css_set(ns->root_cset); - dec_cgroup_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); -} -EXPORT_SYMBOL(free_cgroup_ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - struct cgroup_namespace *new_ns; - struct ucounts *ucounts; - struct css_set *cset; - - BUG_ON(!old_ns); - - if (!(flags & CLONE_NEWCGROUP)) { - get_cgroup_ns(old_ns); - return old_ns; - } - - /* Allow only sysadmin to create cgroup namespace. */ - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - ucounts = inc_cgroup_namespaces(user_ns); - if (!ucounts) - return ERR_PTR(-ENOSPC); - - /* It is not safe to take cgroup_mutex here */ - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - new_ns = alloc_cgroup_ns(); - if (IS_ERR(new_ns)) { - put_css_set(cset); - dec_cgroup_namespaces(ucounts); - return new_ns; - } - - new_ns->user_ns = get_user_ns(user_ns); - new_ns->ucounts = ucounts; - new_ns->root_cset = cset; - - return new_ns; -} - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) -{ - struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || - !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* Don't need to do anything if we are attaching to our own cgroupns. */ - if (cgroup_ns == nsproxy->cgroup_ns) - return 0; - - get_cgroup_ns(cgroup_ns); - put_cgroup_ns(nsproxy->cgroup_ns); - nsproxy->cgroup_ns = cgroup_ns; - - return 0; -} - -static struct ns_common *cgroupns_get(struct task_struct *task) -{ - struct cgroup_namespace *ns = NULL; - struct nsproxy *nsproxy; - - task_lock(task); - nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->cgroup_ns; - get_cgroup_ns(ns); - } - task_unlock(task); - - return ns ? &ns->ns : NULL; -} - -static void cgroupns_put(struct ns_common *ns) -{ - put_cgroup_ns(to_cg_ns(ns)); -} - -static struct user_namespace *cgroupns_owner(struct ns_common *ns) -{ - return to_cg_ns(ns)->user_ns; -} - -const struct proc_ns_operations cgroupns_operations = { - .name = "cgroup", - .type = CLONE_NEWCGROUP, - .get = cgroupns_get, - .put = cgroupns_put, - .install = cgroupns_install, - .owner = cgroupns_owner, -}; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); - #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, bool overridable) @@ -6510,149 +5077,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, return ret; } #endif /* CONFIG_CGROUP_BPF */ - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %p\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c index b3088886cd37..f6501f4f6040 100644 --- a/kernel/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -44,6 +44,8 @@ #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/seq_file.h> #include <linux/security.h> #include <linux/slab.h> @@ -2119,10 +2121,8 @@ int __init cpuset_init(void) { int err = 0; - if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) - BUG(); - if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); @@ -2137,8 +2137,7 @@ int __init cpuset_init(void) if (err < 0) return err; - if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); return 0; } @@ -2352,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rebuild_sched_domains(); } -void cpuset_update_active_cpus(bool cpu_online) +void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c index 1b72d56edce5..1b72d56edce5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup/freezer.c diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c new file mode 100644 index 000000000000..66129eb4371d --- /dev/null +++ b/kernel/cgroup/namespace.c @@ -0,0 +1,155 @@ +#include "cgroup-internal.h" + +#include <linux/sched/task.h> +#include <linux/slab.h> +#include <linux/nsproxy.h> +#include <linux/proc_ns.h> + + +/* cgroup namespaces */ + +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + refcount_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct ucounts *ucounts; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + dec_cgroup_namespaces(ucounts); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c index 2bd673783f1a..2237201d66d5 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup/pids.c @@ -214,7 +214,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) /* * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies - * on threadgroup_change_begin() held by the copy_process(). + * on cgroup_threadgroup_change_begin() held by the copy_process(). */ static int pids_can_fork(struct task_struct *task) { @@ -229,7 +229,7 @@ static int pids_can_fork(struct task_struct *task) /* Only log the first time events_limit is incremented. */ if (atomic64_inc_return(&pids->events_limit) == 1) { pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); + pr_cont_cgroup_path(css->cgroup); pr_cont("\n"); } cgroup_file_notify(&pids->events_file); diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c new file mode 100644 index 000000000000..defad3c5e7dc --- /dev/null +++ b/kernel/cgroup/rdma.c @@ -0,0 +1,619 @@ +/* + * RDMA resource limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop processes from consuming + * additional RDMA resources after a certain limit is reached. + * + * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/cgroup.h> +#include <linux/parser.h> +#include <linux/cgroup_rdma.h> + +#define RDMACG_MAX_STR "max" + +/* + * Protects list of resource pools maintained on per cgroup basis + * and rdma device list. + */ +static DEFINE_MUTEX(rdmacg_mutex); +static LIST_HEAD(rdmacg_devices); + +enum rdmacg_file_type { + RDMACG_RESOURCE_TYPE_MAX, + RDMACG_RESOURCE_TYPE_STAT, +}; + +/* + * resource table definition as to be seen by the user. + * Need to add entries to it when more resources are + * added/defined at IB verb/core layer. + */ +static char const *rdmacg_resource_names[] = { + [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", + [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", +}; + +/* resource tracker for each resource of rdma cgroup */ +struct rdmacg_resource { + int max; + int usage; +}; + +/* + * resource pool object which represents per cgroup, per device + * resources. There are multiple instances of this object per cgroup, + * therefore it cannot be embedded within rdma_cgroup structure. It + * is maintained as list. + */ +struct rdmacg_resource_pool { + struct rdmacg_device *device; + struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; + + struct list_head cg_node; + struct list_head dev_node; + + /* count active user tasks of this pool */ + u64 usage_sum; + /* total number counts which are set to max */ + int num_max_cnt; +}; + +static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) +{ + return container_of(css, struct rdma_cgroup, css); +} + +static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) +{ + return css_rdmacg(cg->css.parent); +} + +static inline struct rdma_cgroup *get_current_rdmacg(void) +{ + return css_rdmacg(task_get_css(current, rdma_cgrp_id)); +} + +static void set_resource_limit(struct rdmacg_resource_pool *rpool, + int index, int new_max) +{ + if (new_max == S32_MAX) { + if (rpool->resources[index].max != S32_MAX) + rpool->num_max_cnt++; + } else { + if (rpool->resources[index].max == S32_MAX) + rpool->num_max_cnt--; + } + rpool->resources[index].max = new_max; +} + +static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) +{ + int i; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) + set_resource_limit(rpool, i, S32_MAX); +} + +static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) +{ + lockdep_assert_held(&rdmacg_mutex); + + list_del(&rpool->cg_node); + list_del(&rpool->dev_node); + kfree(rpool); +} + +static struct rdmacg_resource_pool * +find_cg_rpool_locked(struct rdma_cgroup *cg, + struct rdmacg_device *device) + +{ + struct rdmacg_resource_pool *pool; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(pool, &cg->rpools, cg_node) + if (pool->device == device) + return pool; + + return NULL; +} + +static struct rdmacg_resource_pool * +get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + if (rpool) + return rpool; + + rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); + if (!rpool) + return ERR_PTR(-ENOMEM); + + rpool->device = device; + set_all_resource_max_limit(rpool); + + INIT_LIST_HEAD(&rpool->cg_node); + INIT_LIST_HEAD(&rpool->dev_node); + list_add_tail(&rpool->cg_node, &cg->rpools); + list_add_tail(&rpool->dev_node, &device->rpools); + return rpool; +} + +/** + * uncharge_cg_locked - uncharge resource for rdma cgroup + * @cg: pointer to cg to uncharge and all parents in hierarchy + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cg (resource pool) + * + * It also frees the resource pool which was created as part of + * charging operation when there are no resources attached to + * resource pool. + */ +static void +uncharge_cg_locked(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + + /* + * rpool cannot be null at this stage. Let kernel operate in case + * if there a bug in IB stack or rdma controller, instead of crashing + * the system. + */ + if (unlikely(!rpool)) { + pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); + return; + } + + rpool->resources[index].usage--; + + /* + * A negative count (or overflow) is invalid, + * it indicates a bug in the rdma controller. + */ + WARN_ON_ONCE(rpool->resources[index].usage < 0); + rpool->usage_sum--; + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } +} + +/** + * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup + * stop uncharging + * @index: index of the resource to uncharge in cg in given resource pool + */ +static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, + struct rdmacg_device *device, + struct rdma_cgroup *stop_cg, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *p; + + mutex_lock(&rdmacg_mutex); + + for (p = cg; p != stop_cg; p = parent_rdmacg(p)) + uncharge_cg_locked(p, device, index); + + mutex_unlock(&rdmacg_mutex); + + css_put(&cg->css); +} + +/** + * rdmacg_uncharge - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cgroup in given resource pool + */ +void rdmacg_uncharge(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + if (index >= RDMACG_RESOURCE_MAX) + return; + + rdmacg_uncharge_hierarchy(cg, device, NULL, index); +} +EXPORT_SYMBOL(rdmacg_uncharge); + +/** + * rdmacg_try_charge - hierarchically try to charge the rdma resource + * @rdmacg: pointer to rdma cgroup which will own this resource + * @device: pointer to rdmacg device + * @index: index of the resource to charge in cgroup (resource pool) + * + * This function follows charging resource in hierarchical way. + * It will fail if the charge would cause the new value to exceed the + * hierarchical limit. + * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. + * Returns pointer to rdmacg for this resource when charging is successful. + * + * Charger needs to account resources on two criteria. + * (a) per cgroup & (b) per device resource usage. + * Per cgroup resource usage ensures that tasks of cgroup doesn't cross + * the configured limits. Per device provides granular configuration + * in multi device usage. It allocates resource pool in the hierarchy + * for each parent it come across for first resource. Later on resource + * pool will be available. Therefore it will be much faster thereon + * to charge/uncharge. + */ +int rdmacg_try_charge(struct rdma_cgroup **rdmacg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *cg, *p; + struct rdmacg_resource_pool *rpool; + s64 new; + int ret = 0; + + if (index >= RDMACG_RESOURCE_MAX) + return -EINVAL; + + /* + * hold on to css, as cgroup can be removed but resource + * accounting happens on css. + */ + cg = get_current_rdmacg(); + + mutex_lock(&rdmacg_mutex); + for (p = cg; p; p = parent_rdmacg(p)) { + rpool = get_cg_rpool_locked(p, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto err; + } else { + new = rpool->resources[index].usage + 1; + if (new > rpool->resources[index].max) { + ret = -EAGAIN; + goto err; + } else { + rpool->resources[index].usage = new; + rpool->usage_sum++; + } + } + } + mutex_unlock(&rdmacg_mutex); + + *rdmacg = cg; + return 0; + +err: + mutex_unlock(&rdmacg_mutex); + rdmacg_uncharge_hierarchy(cg, device, p, index); + return ret; +} +EXPORT_SYMBOL(rdmacg_try_charge); + +/** + * rdmacg_register_device - register rdmacg device to rdma controller. + * @device: pointer to rdmacg device whose resources need to be accounted. + * + * If IB stack wish a device to participate in rdma cgroup resource + * tracking, it must invoke this API to register with rdma cgroup before + * any user space application can start using the RDMA resources. + * Returns 0 on success or EINVAL when table length given is beyond + * supported size. + */ +int rdmacg_register_device(struct rdmacg_device *device) +{ + INIT_LIST_HEAD(&device->dev_node); + INIT_LIST_HEAD(&device->rpools); + + mutex_lock(&rdmacg_mutex); + list_add_tail(&device->dev_node, &rdmacg_devices); + mutex_unlock(&rdmacg_mutex); + return 0; +} +EXPORT_SYMBOL(rdmacg_register_device); + +/** + * rdmacg_unregister_device - unregister rdmacg device from rdma controller. + * @device: pointer to rdmacg device which was previously registered with rdma + * controller using rdmacg_register_device(). + * + * IB stack must invoke this after all the resources of the IB device + * are destroyed and after ensuring that no more resources will be created + * when this API is invoked. + */ +void rdmacg_unregister_device(struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool, *tmp; + + /* + * Synchronize with any active resource settings, + * usage query happening via configfs. + */ + mutex_lock(&rdmacg_mutex); + list_del_init(&device->dev_node); + + /* + * Now that this device is off the cgroup list, its safe to free + * all the rpool resources. + */ + list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) + free_cg_rpool_locked(rpool); + + mutex_unlock(&rdmacg_mutex); +} +EXPORT_SYMBOL(rdmacg_unregister_device); + +static int parse_resource(char *c, int *intval) +{ + substring_t argstr; + const char **table = &rdmacg_resource_names[0]; + char *name, *value = c; + size_t len; + int ret, i = 0; + + name = strsep(&value, "="); + if (!name || !value) + return -EINVAL; + + len = strlen(value); + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + if (strcmp(table[i], name)) + continue; + + argstr.from = value; + argstr.to = value + len; + + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + break; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; + } + break; + } + return -EINVAL; +} + +static int rdmacg_parse_limits(char *options, + int *new_limits, unsigned long *enables) +{ + char *c; + int err = -EINVAL; + + /* parse resource options */ + while ((c = strsep(&options, " ")) != NULL) { + int index, intval; + + index = parse_resource(c, &intval); + if (index < 0) + goto err; + + new_limits[index] = intval; + *enables |= BIT(index); + } + return 0; + +err: + return err; +} + +static struct rdmacg_device *rdmacg_get_device_locked(const char *name) +{ + struct rdmacg_device *device; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) + if (!strcmp(name, device->name)) + return device; + + return NULL; +} + +static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdma_cgroup *cg = css_rdmacg(of_css(of)); + const char *dev_name; + struct rdmacg_resource_pool *rpool; + struct rdmacg_device *device; + char *options = strstrip(buf); + int *new_limits; + unsigned long enables = 0; + int i = 0, ret = 0; + + /* extract the device name first */ + dev_name = strsep(&options, " "); + if (!dev_name) { + ret = -EINVAL; + goto err; + } + + new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); + if (!new_limits) { + ret = -ENOMEM; + goto err; + } + + ret = rdmacg_parse_limits(options, new_limits, &enables); + if (ret) + goto parse_err; + + /* acquire lock to synchronize with hot plug devices */ + mutex_lock(&rdmacg_mutex); + + device = rdmacg_get_device_locked(dev_name); + if (!device) { + ret = -ENODEV; + goto dev_err; + } + + rpool = get_cg_rpool_locked(cg, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto dev_err; + } + + /* now set the new limits of the rpool */ + for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) + set_resource_limit(rpool, i, new_limits[i]); + + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } + +dev_err: + mutex_unlock(&rdmacg_mutex); + +parse_err: + kfree(new_limits); + +err: + return ret ?: nbytes; +} + +static void print_rpool_values(struct seq_file *sf, + struct rdmacg_resource_pool *rpool) +{ + enum rdmacg_file_type sf_type; + int i; + u32 value; + + sf_type = seq_cft(sf)->private; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + seq_puts(sf, rdmacg_resource_names[i]); + seq_putc(sf, '='); + if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { + if (rpool) + value = rpool->resources[i].max; + else + value = S32_MAX; + } else { + if (rpool) + value = rpool->resources[i].usage; + else + value = 0; + } + + if (value == S32_MAX) + seq_puts(sf, RDMACG_MAX_STR); + else + seq_printf(sf, "%d", value); + seq_putc(sf, ' '); + } +} + +static int rdmacg_resource_read(struct seq_file *sf, void *v) +{ + struct rdmacg_device *device; + struct rdmacg_resource_pool *rpool; + struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) { + seq_printf(sf, "%s ", device->name); + + rpool = find_cg_rpool_locked(cg, device); + print_rpool_values(sf, rpool); + + seq_putc(sf, '\n'); + } + + mutex_unlock(&rdmacg_mutex); + return 0; +} + +static struct cftype rdmacg_files[] = { + { + .name = "max", + .write = rdmacg_resource_set_max, + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_MAX, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_STAT, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +static struct cgroup_subsys_state * +rdmacg_css_alloc(struct cgroup_subsys_state *parent) +{ + struct rdma_cgroup *cg; + + cg = kzalloc(sizeof(*cg), GFP_KERNEL); + if (!cg) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&cg->rpools); + return &cg->css; +} + +static void rdmacg_css_free(struct cgroup_subsys_state *css) +{ + struct rdma_cgroup *cg = css_rdmacg(css); + + kfree(cg); +} + +/** + * rdmacg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away and responsible + * for shooting down all rdmacg associated with @css. As part of that it + * marks all the resource pool entries to max value, so that when resources are + * uncharged, associated resource pool can be freed as well. + */ +static void rdmacg_css_offline(struct cgroup_subsys_state *css) +{ + struct rdma_cgroup *cg = css_rdmacg(css); + struct rdmacg_resource_pool *rpool; + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(rpool, &cg->rpools, cg_node) + set_all_resource_max_limit(rpool); + + mutex_unlock(&rdmacg_mutex); +} + +struct cgroup_subsys rdma_cgrp_subsys = { + .css_alloc = rdmacg_css_alloc, + .css_free = rdmacg_css_free, + .css_offline = rdmacg_css_offline, + .legacy_cftypes = rdmacg_files, + .dfl_cftypes = rdmacg_files, +}; diff --git a/kernel/compat.c b/kernel/compat.c index 19aec5d98108..933bcb31ae10 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) @@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; + struct timespec64 tu64; mm_segment_t oldfs; long ret; if (compat_get_timespec(&tu, rqtp)) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; oldfs = get_fs(); set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu, + ret = hrtimer_nanosleep(&tu64, rmtp ? (struct timespec __user *)&rmt : NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 1a8f34f63601..26a06e09a5bd 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -21,6 +21,7 @@ CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y +CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y @@ -129,6 +130,7 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_QUOTA=y +CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 297756be369c..28ee064b6744 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,4 +1,5 @@ # KEEP ALPHABETICALLY SORTED +# CONFIG_AIO is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set @@ -11,7 +12,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_COMPACTION=y -CONFIG_DEBUG_RODATA=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 0a5f630f5c54..9ae6fbe5b5cf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -7,7 +7,9 @@ #include <linux/smp.h> #include <linux/init.h> #include <linux/notifier.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/task.h> #include <linux/unistd.h> #include <linux/cpu.h> #include <linux/oom.h> @@ -1123,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +int __boot_cpu_id; + #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -1333,26 +1337,21 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, struct cpuhp_step *sp; int ret = 0; - mutex_lock(&cpuhp_state_mutex); - if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) { ret = cpuhp_reserve_state(state); if (ret < 0) - goto out; + return ret; state = ret; } sp = cpuhp_get_step(state); - if (name && sp->name) { - ret = -EBUSY; - goto out; - } + if (name && sp->name) + return -EBUSY; + sp->startup.single = startup; sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); -out: - mutex_unlock(&cpuhp_state_mutex); return ret; } @@ -1426,6 +1425,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) goto add_node; @@ -1445,16 +1445,14 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, if (ret) { if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); - goto err; + goto unlock; } } add_node: ret = 0; - mutex_lock(&cpuhp_state_mutex); hlist_add_head(node, &sp->list); +unlock: mutex_unlock(&cpuhp_state_mutex); - -err: put_online_cpus(); return ret; } @@ -1489,6 +1487,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); @@ -1522,6 +1521,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, } } out: + mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the @@ -1545,6 +1545,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* @@ -1561,7 +1563,6 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, } remove: - mutex_lock(&cpuhp_state_mutex); hlist_del(node); mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); @@ -1569,6 +1570,7 @@ remove: return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); + /** * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state * @state: The state to remove @@ -1587,6 +1589,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { WARN(!hlist_empty(&sp->list), "Error: Removing state %d which has instances left.\n", @@ -1611,6 +1614,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); + mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); @@ -1813,6 +1817,10 @@ void __init boot_cpu_init(void) set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); + +#ifdef CONFIG_SMP + __boot_cpu_id = cpu; +#endif } /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c new file mode 100644 index 000000000000..fcbd568f1e95 --- /dev/null +++ b/kernel/crash_core.c @@ -0,0 +1,439 @@ +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/crash_core.h> +#include <linux/utsname.h> +#include <linux/vmalloc.h> + +#include <asm/page.h> +#include <asm/sections.h> + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); + + return buf; +} + +void final_note(Elf_Word *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/kernel/cred.c b/kernel/cred.c index 5f264fb5737d..2bc66075740f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -12,6 +12,7 @@ #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/coredump.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/init_task.h> diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 79517e5549f1..65c0f1363788 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include <linux/init.h> #include <linux/kgdb.h> #include <linux/kdb.h> +#include <linux/nmi.h> #include <linux/pid.h> #include <linux/smp.h> #include <linux/mm.h> @@ -232,9 +233,9 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) int i; for (i = 0; i < VMACACHE_SIZE; i++) { - if (!current->vmacache[i]) + if (!current->vmacache.vmas[i]) continue; - flush_cache_range(current->vmacache[i], + flush_cache_range(current->vmacache.vmas[i], addr, addr + BREAK_INSTR_SIZE); } } diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 19d9a578c753..7510dc687c0d 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -29,6 +29,7 @@ */ #include <linux/kernel.h> +#include <linux/sched/signal.h> #include <linux/kgdb.h> #include <linux/kdb.h> #include <linux/serial_core.h> diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index fe15fff5df53..6ad4a9fcbd6f 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -12,7 +12,8 @@ #include <linux/ctype.h> #include <linux/string.h> #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> #include <linux/kdb.h> #include <linux/nmi.h> #include "kdb_private.h" diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ca183919d302..c8146d53ca67 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -18,6 +18,9 @@ #include <linux/kmsg_dump.h> #include <linux/reboot.h> #include <linux/sched.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/stat.h> +#include <linux/sched/debug.h> #include <linux/sysrq.h> #include <linux/smp.h> #include <linux/utsname.h> diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 435c14a45118..4a1c33416b6a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,8 @@ */ #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/cputime.h> #include <linux/slab.h> #include <linux/taskstats.h> #include <linux/time.h> @@ -82,19 +84,19 @@ void __delayacct_blkio_end(void) int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - cputime_t utime, stime, stimescaled, utimescaled; + u64 utime, stime, stimescaled, utimescaled; unsigned long long t2, t3; unsigned long flags, t1; s64 tmp; task_cputime(tsk, &utime, &stime); tmp = (s64)d->cpu_run_real_total; - tmp += cputime_to_nsecs(utime + stime); + tmp += utime + stime; d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; task_cputime_scaled(tsk, &utimescaled, &stimescaled); tmp = (s64)d->cpu_scaled_run_real_total; - tmp += cputime_to_nsecs(utimescaled + stimescaled); + tmp += utimescaled + stimescaled; d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index e9fdb5203de5..1b2be63c8528 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -11,6 +11,8 @@ #include <linux/perf_event.h> #include <linux/slab.h> +#include <linux/sched/task_stack.h> + #include "internal.h" struct callchain_cpus_entries { @@ -227,12 +229,18 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { + mm_segment_t fs; + if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + + fs = get_fs(); + set_fs(USER_DS); perf_callchain_user(&ctx, regs); + set_fs(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index e235bb991bdd..6e75a5c9412d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -46,6 +46,10 @@ #include <linux/filter.h> #include <linux/namei.h> #include <linux/parser.h> +#include <linux/sched/clock.h> +#include <linux/sched/mm.h> +#include <linux/proc_ns.h> +#include <linux/mount.h> #include "internal.h" @@ -355,6 +359,8 @@ enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_TIME = 0x4, + /* see ctx_resched() for details */ + EVENT_CPU = 0x8, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -375,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; @@ -453,7 +460,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -678,6 +685,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, info->timestamp = ctx->timestamp; } +static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); + #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ @@ -690,61 +699,46 @@ perf_cgroup_set_timestamp(struct task_struct *task, static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; - struct pmu *pmu; + struct list_head *list; unsigned long flags; /* - * disable interrupts to avoid geting nr_cgroup - * changes via __perf_event_disable(). Also - * avoids preemption. + * Disable interrupts and preemption to avoid this CPU's + * cgrp_cpuctx_entry to change under us. */ local_irq_save(flags); - /* - * we reschedule only in the presence of cgroup - * constrained events. - */ - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - continue; /* ensure we process each cpuctx once */ + list = this_cpu_ptr(&cgrp_cpuctx_list); + list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - /* - * perf_cgroup_events says at least one - * context on this CPU has cgroup events. - * - * ctx->nr_cgroups reports the number of cgroup - * events for a context. - */ - if (cpuctx->ctx.nr_cgroups > 0) { - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu + */ + cpuctx->cgrp = perf_cgroup_from_task(task, + &cpuctx->ctx); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } local_irq_restore(flags); @@ -889,6 +883,7 @@ list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) { struct perf_cpu_context *cpuctx; + struct list_head *cpuctx_entry; if (!is_cgroup_event(event)) return; @@ -902,15 +897,16 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - - /* - * cpuctx->cgrp is NULL until a cgroup event is sched in or - * ctx->nr_cgroup == 0 . - */ - if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; - else if (!add) + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ + if (add) { + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + if (perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + } else { + list_del(cpuctx_entry); cpuctx->cgrp = NULL; + } } #else /* !CONFIG_CGROUP_PERF */ @@ -1005,7 +1001,7 @@ list_update_cgroup_event(struct perf_event *event, */ #define PERF_CPU_HRTIMER (1000 / HZ) /* - * function must be called with interrupts disbled + * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { @@ -1453,6 +1449,20 @@ static void update_group_times(struct perf_event *leader) update_event_times(event); } +static enum event_type_t get_event_type(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + enum event_type_t event_type; + + lockdep_assert_held(&ctx->lock); + + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; + if (!ctx->task) + event_type |= EVENT_CPU; + + return event_type; +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -2226,7 +2236,8 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + enum event_type_t event_type) { if (!cpuctx->task_ctx) return; @@ -2234,7 +2245,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, EVENT_ALL); + ctx_sched_out(ctx, cpuctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, @@ -2249,13 +2260,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +/* + * We want to maintain the following priority of scheduling: + * - CPU pinned (EVENT_CPU | EVENT_PINNED) + * - task pinned (EVENT_PINNED) + * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) + * - task flexible (EVENT_FLEXIBLE). + * + * In order to avoid unscheduling and scheduling back in everything every + * time an event is added, only do it for the groups of equal priority and + * below. + * + * This can be called after a batch operation on task events, in which case + * event_type is a bit mask of the types of events involved. For CPU events, + * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. + */ static void ctx_resched(struct perf_cpu_context *cpuctx, - struct perf_event_context *task_ctx) + struct perf_event_context *task_ctx, + enum event_type_t event_type) { + enum event_type_t ctx_event_type = event_type & EVENT_ALL; + bool cpu_event = !!(event_type & EVENT_CPU); + + /* + * If pinned groups are involved, flexible groups also need to be + * scheduled out. + */ + if (event_type & EVENT_PINNED) + event_type |= EVENT_FLEXIBLE; + perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx); - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + task_ctx_sched_out(cpuctx, task_ctx, event_type); + + /* + * Decide which cpu ctx groups to schedule out based on the types + * of events that caused rescheduling: + * - EVENT_CPU: schedule out corresponding groups; + * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; + * - otherwise, do nothing more. + */ + if (cpu_event) + cpu_ctx_sched_out(cpuctx, ctx_event_type); + else if (ctx_event_type & EVENT_PINNED) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -2302,7 +2351,7 @@ static int __perf_install_in_context(void *info) if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2469,7 +2518,7 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } /* @@ -2896,7 +2945,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - task_ctx_sched_out(cpuctx, ctx); + task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); raw_spin_unlock(&ctx->lock); } } @@ -2943,7 +2992,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) continue; @@ -3133,8 +3182,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. + * + * However, if task's ctx is not carrying any pinned + * events, no need to flip the cpuctx's events around. */ - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (!list_empty(&ctx->pinned_groups)) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); @@ -3449,6 +3502,7 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; @@ -3462,15 +3516,19 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); ctx_sched_out(ctx, cpuctx, EVENT_TIME); - list_for_each_entry(event, &ctx->event_list, event_entry) + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); + event_type |= get_event_type(event); + } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx); + ctx_resched(cpuctx, ctx, event_type); + } else { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); } perf_ctx_unlock(cpuctx, ctx); @@ -3936,6 +3994,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); + if (event->attr.namespaces) + atomic_dec(&nr_namespaces_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) @@ -4201,7 +4261,7 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); /* - * Mark this even as STATE_DEAD, there is no external reference to it + * Mark this event as STATE_DEAD, there is no external reference to it * anymore. * * Anybody acquiring event->child_mutex after the below loop _must_ @@ -4874,9 +4934,9 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int perf_mmap_fault(struct vm_fault *vmf) { - struct perf_event *event = vma->vm_file->private_data; + struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; @@ -4899,7 +4959,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto unlock; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; @@ -6436,6 +6496,7 @@ static void perf_event_task(struct task_struct *task, void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); + perf_event_namespaces(task); } /* @@ -6538,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec) } /* + * namespaces tracking + */ + +struct perf_namespaces_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 nr_namespaces; + struct perf_ns_link_info link_info[NR_NAMESPACES]; + } event_id; +}; + +static int perf_event_namespaces_match(struct perf_event *event) +{ + return event->attr.namespaces; +} + +static void perf_event_namespaces_output(struct perf_event *event, + void *data) +{ + struct perf_namespaces_event *namespaces_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_namespaces_match(event)) + return; + + perf_event_header__init_id(&namespaces_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + namespaces_event->event_id.header.size); + if (ret) + return; + + namespaces_event->event_id.pid = perf_event_pid(event, + namespaces_event->task); + namespaces_event->event_id.tid = perf_event_tid(event, + namespaces_event->task); + + perf_output_put(&handle, namespaces_event->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, + struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct path ns_path; + struct inode *ns_inode; + void *error; + + error = ns_get_path(&ns_path, task, ns_ops); + if (!error) { + ns_inode = ns_path.dentry->d_inode; + ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); + ns_link_info->ino = ns_inode->i_ino; + } +} + +void perf_event_namespaces(struct task_struct *task) +{ + struct perf_namespaces_event namespaces_event; + struct perf_ns_link_info *ns_link_info; + + if (!atomic_read(&nr_namespaces_events)) + return; + + namespaces_event = (struct perf_namespaces_event){ + .task = task, + .event_id = { + .header = { + .type = PERF_RECORD_NAMESPACES, + .misc = 0, + .size = sizeof(namespaces_event.event_id), + }, + /* .pid */ + /* .tid */ + .nr_namespaces = NR_NAMESPACES, + /* .link_info[NR_NAMESPACES] */ + }, + }; + + ns_link_info = namespaces_event.event_id.link_info; + + perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], + task, &mntns_operations); + +#ifdef CONFIG_USER_NS + perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], + task, &userns_operations); +#endif +#ifdef CONFIG_NET_NS + perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], + task, &netns_operations); +#endif +#ifdef CONFIG_UTS_NS + perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], + task, &utsns_operations); +#endif +#ifdef CONFIG_IPC_NS + perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], + task, &ipcns_operations); +#endif +#ifdef CONFIG_PID_NS + perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], + task, &pidns_operations); +#endif +#ifdef CONFIG_CGROUPS + perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], + task, &cgroupns_operations); +#endif + + perf_iterate_sb(perf_event_namespaces_output, + &namespaces_event, + NULL); +} + +/* * mmap tracking */ @@ -8044,6 +8231,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (task == TASK_TOMBSTONE) return; + if (!ifh->nr_file_filters) + return; + mm = get_task_mm(event->ctx->task); if (!mm) goto restart; @@ -8214,6 +8404,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, * attribute. */ if (state == IF_STATE_END) { + ret = -EINVAL; if (kernel && event->attr.exclude_kernel) goto fail; @@ -8221,6 +8412,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (!filename) goto fail; + /* + * For now, we only support file-based filters + * in per-task events; doing so for CPU-wide + * events requires additional context switching + * trickery, since same object code will be + * mapped at different virtual addresses in + * different processes. + */ + ret = -EOPNOTSUPP; + if (!event->ctx->task) + goto fail_free_name; + /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) @@ -8236,6 +8439,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, !S_ISREG(filter->inode->i_mode)) /* free_filters_list() will iput() */ goto fail; + + event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ @@ -8275,24 +8480,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str) if (WARN_ON_ONCE(event->parent)) return -EINVAL; - /* - * For now, we only support filtering in per-task events; doing so - * for CPU-wide events requires additional context switching trickery, - * since same object code will be mapped at different virtual - * addresses in different processes. - */ - if (!event->ctx->task) - return -EOPNOTSUPP; - ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) - return ret; + goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); - if (ret) { - free_filters_list(&filters); - return ret; - } + if (ret) + goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); @@ -8301,6 +8495,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str) perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; + +fail_free_filters: + free_filters_list(&filters); + +fail_clear_files: + event->addr_filters.nr_file_filters = 0; + + return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -8652,37 +8854,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) return NULL; } -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->unique_pmu == old_pmu) - cpuctx->unique_pmu = pmu; - } -} - static void free_pmu_context(struct pmu *pmu) { - struct pmu *i; - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. - */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - free_percpu(pmu->pmu_cpu_context); -out: mutex_unlock(&pmus_lock); } @@ -8886,8 +9061,6 @@ skip_type: cpuctx->ctx.pmu = pmu; __perf_mux_hrtimer_init(cpuctx, cpu); - - cpuctx->unique_pmu = pmu; } got_cpu_context: @@ -9005,6 +9178,14 @@ static struct pmu *perf_init_event(struct perf_event *event) idx = srcu_read_lock(&pmus_srcu); + /* Try parent's PMU first: */ + if (event->parent && event->parent->pmu) { + pmu = event->parent->pmu; + ret = perf_try_init_event(pmu, event); + if (!ret) + goto unlock; + } + rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); @@ -9097,6 +9278,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); + if (event->attr.namespaces) + atomic_inc(&nr_namespaces_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) @@ -9642,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EACCES; } + if (attr.namespaces) { + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } + if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; @@ -9910,6 +10098,7 @@ SYSCALL_DEFINE5(perf_event_open, * of swizzling perf_event::ctx. */ perf_remove_from_context(group_leader, 0); + put_ctx(gctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { @@ -9948,13 +10137,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - - /* - * Now that all events are installed in @ctx, nothing - * references @gctx anymore, so drop the last reference we have - * on it. - */ - put_ctx(gctx); } /* @@ -10265,7 +10447,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation @@ -10374,21 +10556,22 @@ void perf_event_free_task(struct task_struct *task) continue; mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, - group_entry) - perf_free_event(event, ctx); + raw_spin_lock_irq(&ctx->lock); + /* + * Destroy the task <-> ctx relation and mark the context dead. + * + * This is important because even though the task hasn't been + * exposed yet the context has been (through child_list). + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ + raw_spin_unlock_irq(&ctx->lock); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; - mutex_unlock(&ctx->mutex); - put_ctx(ctx); } } @@ -10426,7 +10609,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) } /* - * inherit a event from parent task to child task: + * Inherit a event from parent task to child task. + * + * Returns: + * - valid pointer on success + * - NULL for orphaned events + * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, @@ -10520,6 +10708,16 @@ inherit_event(struct perf_event *parent_event, return child_event; } +/* + * Inherits an event group. + * + * This will quietly suppress orphaned events; !inherit_event() is not an error. + * This matches with perf_event_release_kernel() removing all child events. + * + * Returns: + * - 0 on success + * - <0 on error + */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, @@ -10534,6 +10732,11 @@ static int inherit_group(struct perf_event *parent_event, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); + /* + * @leader can be NULL here because of is_orphaned_event(). In this + * case inherit_event() will create individual events, similar to what + * perf_group_detach() would do anyway. + */ list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); @@ -10543,6 +10746,17 @@ static int inherit_group(struct perf_event *parent_event, return 0; } +/* + * Creates the child task context and tries to inherit the event-group. + * + * Clears @inherited_all on !attr.inherited or error. Note that we'll leave + * inherited_all set when we 'fail' to inherit an orphaned event; this is + * consistent with perf_event_release_kernel() removing all child events. + * + * Returns: + * - 0 on success + * - <0 on error + */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, @@ -10565,7 +10779,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; @@ -10627,7 +10840,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) - break; + goto out_unlock; } /* @@ -10643,7 +10856,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) - break; + goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); @@ -10671,6 +10884,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); +out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); @@ -10714,6 +10928,9 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); +#ifdef CONFIG_CGROUP_PERF + INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); +#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } @@ -10911,5 +11128,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .attach = perf_cgroup_attach, + /* + * Implicitly enable on dfl hierarchy so that perf events can + * always be filtered by cgroup2 path as long as perf_event + * controller is not mounted on a legacy hierarchy. + */ + .implicit_on_dfl = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 257fa460b846..2831480c63a2 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->paused = 1; } +void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) +{ + /* + * OVERWRITE is determined by perf_aux_output_end() and can't + * be passed in directly. + */ + if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) + return; + + handle->aux_flags |= flags; +} +EXPORT_SYMBOL_GPL(perf_aux_output_flag); + /* * This is called before hardware starts writing to the AUX area to * obtain an output handle and make sure there's room in the buffer. @@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, handle->event = event; handle->head = aux_head; handle->size = 0; + handle->aux_flags = 0; /* * In overwrite mode, AUX data stores do not depend on aux_tail, @@ -408,34 +422,32 @@ err: * of the AUX buffer management code is that after pmu::stop(), the AUX * transaction must be stopped and therefore drop the AUX reference count. */ -void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, - bool truncated) +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { + bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); struct ring_buffer *rb = handle->rb; - bool wakeup = truncated; unsigned long aux_head; - u64 flags = 0; - - if (truncated) - flags |= PERF_AUX_FLAG_TRUNCATED; /* in overwrite mode, driver provides aux_head via handle */ if (rb->aux_overwrite) { - flags |= PERF_AUX_FLAG_OVERWRITE; + handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; local_set(&rb->aux_head, aux_head); } else { + handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; + aux_head = local_read(&rb->aux_head); local_add(size, &rb->aux_head); } - if (size || flags) { + if (size || handle->aux_flags) { /* * Only send RECORD_AUX if we have something useful to communicate */ - perf_event_aux_event(handle->event, aux_head, size, flags); + perf_event_aux_event(handle->event, aux_head, size, + handle->aux_flags); } aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); @@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, } if (wakeup) { - if (truncated) + if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) handle->event->pending_disable = 1; perf_output_wakeup(handle); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d416f3baf392..0e137f98a50c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,8 @@ #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> /* set_pte_at_notify */ @@ -153,14 +155,19 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pte_t *ptep; + struct page_vma_mapped_walk pvmw = { + .page = old_page, + .vma = vma, + .address = addr, + }; int err; /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; + VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); if (err) @@ -171,11 +178,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; - ptep = page_check_address(old_page, mm, addr, &ptl, 0); - if (!ptep) { + if (!page_vma_mapped_walk(&pvmw)) { mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } + VM_BUG_ON_PAGE(addr != pvmw.address, old_page); get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); @@ -187,14 +194,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); + flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + ptep_clear_flush_notify(vma, addr, pvmw.pte); + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); - pte_unmap_unlock(ptep, ptl); + page_vma_mapped_walk_done(&pvmw); if (vma->vm_flags & VM_LOCKED) munlock_vma_page(old_page); @@ -300,8 +308,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -741,7 +749,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) continue; } - if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..516acdb0e0ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -6,6 +6,12 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/sched/autogroup.h> +#include <linux/sched/mm.h> +#include <linux/sched/stat.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> @@ -14,7 +20,6 @@ #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> -#include <linux/security.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> @@ -46,6 +51,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> #include <linux/fs_struct.h> +#include <linux/userfaultfd_k.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> @@ -55,6 +61,7 @@ #include <linux/shm.h> #include <linux/kcov.h> #include <linux/random.h> +#include <linux/rcuwait.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -86,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk) bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); - cputime_t utime, stime; + u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); @@ -282,6 +289,35 @@ retry: return task; } +void rcuwait_wake_up(struct rcuwait *w) +{ + struct task_struct *task; + + rcu_read_lock(); + + /* + * Order condition vs @task, such that everything prior to the load + * of @task is visible. This is the condition as to why the user called + * rcuwait_trywake() in the first place. Pairs with set_current_state() + * barrier (A) in rcuwait_wait_event(). + * + * WAIT WAKE + * [S] tsk = current [S] cond = true + * MB (A) MB (B) + * [L] cond [L] tsk + */ + smp_rmb(); /* (B) */ + + /* + * Avoid using task_rcu_dereference() magic as long as we are careful, + * see comment in rcuwait_wait_event() regarding ->exit_state. + */ + task = rcu_dereference(w->task); + if (task) + wake_up_process(task); + rcu_read_unlock(); +} + struct task_struct *try_get_task_struct(struct task_struct **ptask) { struct task_struct *task; @@ -468,12 +504,12 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct *tsk) +static void exit_mm(void) { - struct mm_struct *mm = tsk->mm; + struct mm_struct *mm = current->mm; struct core_state *core_state; - mm_release(tsk, mm); + mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); @@ -491,7 +527,7 @@ static void exit_mm(struct task_struct *tsk) up_read(&mm->mmap_sem); - self.task = tsk; + self.task = current; self.next = xchg(&core_state->dumper.next, &self); /* * Implies mb(), the result of xchg() must be visible @@ -501,22 +537,22 @@ static void exit_mm(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); down_read(&mm->mmap_sem); } - atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); + mmgrab(mm); + BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; + task_lock(current); + current->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - task_unlock(tsk); + task_unlock(current); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) @@ -578,15 +614,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father, return thread; if (father->signal->has_child_subreaper) { + unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. - * We start from father to ensure we can not look into another - * namespace, this is safe because all its threads are dead. + * We can't check reaper != child_reaper to ensure we do not + * cross the namespaces, the exiting parent could be injected + * by setns() + fork(). + * We check pid->level, this is slightly more efficient than + * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ - for (reaper = father; - !same_thread_group(reaper, child_reaper); + for (reaper = father->real_parent; + task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { - /* call_usermodehelper() descendants need this check */ if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) @@ -823,7 +862,7 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - exit_mm(tsk); + exit_mm(); if (group_dead) acct_process(); @@ -1091,7 +1130,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; - cputime_t tgutime, tgstime; + u64 tgutime, tgstime; /* * The resource counters for the group leader are in its @@ -1360,7 +1399,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) @@ -1380,20 +1419,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (!ret) return ret; - ret = security_task_wait(p); - if (unlikely(ret < 0)) { - /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. - */ - if (wo->notask_error) - wo->notask_error = ret; - return 0; - } - if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case @@ -1486,7 +1511,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { diff --git a/kernel/extable.c b/kernel/extable.c index e3beec4a2339..2676d7f8baf6 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,9 +17,12 @@ */ #include <linux/ftrace.h> #include <linux/memory.h> +#include <linux/extable.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> +#include <linux/kprobes.h> +#include <linux/filter.h> #include <asm/sections.h> #include <linux/uaccess.h> @@ -104,6 +107,10 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_ftrace_trampoline(addr)) return 1; + if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) + return 1; + if (is_bpf_text_address(addr)) + return 1; /* * There might be init symbols in saved stacktraces. * Give those symbols a chance to be printed in @@ -123,7 +130,13 @@ int kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; - return is_ftrace_trampoline(addr); + if (is_ftrace_trampoline(addr)) + return 1; + if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) + return 1; + if (is_bpf_text_address(addr)) + return 1; + return 0; } /* diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..06d759ab4c62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -12,6 +12,16 @@ */ #include <linux/slab.h> +#include <linux/sched/autogroup.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/user.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/stat.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> +#include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> #include <linux/module.h> @@ -55,6 +65,7 @@ #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> +#include <linux/userfaultfd_k.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> @@ -76,6 +87,7 @@ #include <linux/compiler.h> #include <linux/sysctl.h> #include <linux/kcov.h> +#include <linux/livepatch.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -167,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack) */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); + +static int free_vm_stack_cache(unsigned int cpu) +{ + struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *vm_stack = cached_vm_stacks[i]; + + if (!vm_stack) + continue; + + vfree(vm_stack->addr); + cached_vm_stacks[i] = NULL; + } + + return 0; +} #endif static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) @@ -191,7 +221,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, + THREADINFO_GFP, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -432,11 +462,13 @@ void __init fork_init(void) int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN -#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +#define ARCH_MIN_TASKALIGN 0 #endif + int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - arch_task_struct_size, ARCH_MIN_TASKALIGN, + arch_task_struct_size, align, SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif @@ -453,6 +485,11 @@ void __init fork_init(void) for (i = 0; i < UCOUNT_COUNTS; i++) { init_user_ns.ucount_max[i] = max_threads/2; } + +#ifdef CONFIG_VMAP_STACK + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", + NULL, free_vm_stack_cache); +#endif } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -523,7 +560,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_int(); + tsk->stack_canary = get_random_long(); #endif /* @@ -559,6 +596,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -615,12 +653,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -676,6 +715,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; @@ -994,7 +1034,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) if (task->flags & PF_KTHREAD) mm = NULL; else - atomic_inc(&mm->mm_users); + mmget(mm); } task_unlock(task); return mm; @@ -1182,7 +1222,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) vmacache_flush(tsk); if (clone_flags & CLONE_VM) { - atomic_inc(&oldmm->mm_users); + mmget(oldmm); mm = oldmm; goto good_mm; } @@ -1297,13 +1337,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); } } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a thread group. */ @@ -1313,7 +1354,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); + sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; sig->cputimer.running = true; } @@ -1322,6 +1363,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } +#endif static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { @@ -1346,11 +1390,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); - INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS + INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@ -1367,9 +1411,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; - sig->has_child_subreaper = current->signal->has_child_subreaper || - current->signal->is_child_subreaper; - mutex_init(&sig->cred_guard_mutex); return 0; @@ -1421,10 +1462,12 @@ static void rt_mutex_init_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; + p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a single task. */ @@ -1437,6 +1480,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[1]); INIT_LIST_HEAD(&tsk->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init(struct task_struct *tsk) { } +#endif static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) @@ -1444,6 +1490,21 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) task->pids[type].pid = pid; } +static inline void rcu_copy_process(struct task_struct *p) +{ +#ifdef CONFIG_PREEMPT_RCU + p->rcu_read_lock_nesting = 0; + p->rcu_read_unlock_special.s = 0; + p->rcu_blocked_node = NULL; + INIT_LIST_HEAD(&p->rcu_node_entry); +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + p->rcu_tasks_holdout = false; + INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); + p->rcu_tasks_idle_cpu = -1; +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1643,9 +1704,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); - retval = copy_semundo(clone_flags, p); + retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; + retval = copy_semundo(clone_flags, p); + if (retval) + goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p); if (retval) goto bad_fork_cleanup_semundo; @@ -1735,7 +1799,7 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - threadgroup_change_begin(current); + cgroup_threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1761,6 +1825,8 @@ static __latent_entropy struct task_struct *copy_process( p->parent_exec_id = current->self_exec_id; } + klp_copy_process(p); + spin_lock(¤t->sighand->siglock); /* @@ -1800,6 +1866,13 @@ static __latent_entropy struct task_struct *copy_process( p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); + /* + * Inherit has_child_subreaper flag under the same + * tasklist_lock with adding child to the process tree + * for propagate_has_child_subreaper optimization. + */ + p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || + p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_PGID); @@ -1825,7 +1898,7 @@ static __latent_entropy struct task_struct *copy_process( proc_fork_connector(p); cgroup_post_fork(p); - threadgroup_change_end(current); + cgroup_threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1836,7 +1909,7 @@ static __latent_entropy struct task_struct *copy_process( bad_fork_cancel_cgroup: cgroup_cancel_fork(p); bad_fork_free_pid: - threadgroup_change_end(current); + cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -1860,6 +1933,8 @@ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); +bad_fork_cleanup_security: + security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: @@ -2053,6 +2128,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif +void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) +{ + struct task_struct *leader, *parent, *child; + int res; + + read_lock(&tasklist_lock); + leader = top = top->group_leader; +down: + for_each_thread(leader, parent) { + list_for_each_entry(child, &parent->children, sibling) { + res = visitor(child, data); + if (res) { + if (res < 0) + goto out; + leader = child; + goto down; + } +up: + ; + } + } + + if (leader != top) { + child = leader; + parent = child->real_parent; + leader = parent->group_leader; + goto up; + } +out: + read_unlock(&tasklist_lock); +} + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif @@ -2069,7 +2176,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, @@ -2277,6 +2384,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } } + perf_event_namespaces(current); + bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); diff --git a/kernel/futex.c b/kernel/futex.c index cdf365036141..357348a6cf6b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,8 @@ #include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/sched/rt.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/bootmem.h> @@ -338,7 +340,7 @@ static inline bool should_fail_futex(bool fshared) static inline void futex_get_mm(union futex_key *key) { - atomic_inc(&key->private.mm->mm_count); + mmgrab(key->private.mm); /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon @@ -800,7 +802,7 @@ static int refill_pi_state_cache(void) return 0; } -static struct futex_pi_state * alloc_pi_state(void) +static struct futex_pi_state *alloc_pi_state(void) { struct futex_pi_state *pi_state = current->pi_state_cache; @@ -810,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +static void get_pi_state(struct futex_pi_state *pi_state) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); +} + /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. @@ -854,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * Look up the task based on what TID userspace gave us. * We dont trust it. */ -static struct task_struct * futex_find_get_task(pid_t pid) +static struct task_struct *futex_find_get_task(pid_t pid) { struct task_struct *p; @@ -914,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr) pi_state->owner = NULL; raw_spin_unlock_irq(&curr->pi_lock); - rt_mutex_unlock(&pi_state->pi_mutex); - + get_pi_state(pi_state); spin_unlock(&hb->lock); + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); @@ -971,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr) * * [10] There is no transient state which leaves owner and user space * TID out of sync. + * + * + * Serialization and lifetime rules: + * + * hb->lock: + * + * hb -> futex_q, relation + * futex_q -> pi_state, relation + * + * (cannot be raw because hb can contain arbitrary amount + * of futex_q's) + * + * pi_mutex->wait_lock: + * + * {uval, pi_state} + * + * (and pi_mutex 'obviously') + * + * p->pi_lock: + * + * p->pi_state_list -> pi_state->list, relation + * + * pi_state->refcount: + * + * pi_state lifetime + * + * + * Lock order: + * + * hb->lock + * pi_mutex->wait_lock + * p->pi_lock + * */ /* @@ -978,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr) * the pi_state against the user space value. If correct, attach to * it. */ -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, +static int attach_to_pi_state(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; + u32 uval2; + int ret; /* * Userspace might have messed up non-PI and PI futexes [3] @@ -989,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, if (unlikely(!pi_state)) return -EINVAL; + /* + * We get here with hb->lock held, and having found a + * futex_top_waiter(). This means that futex_lock_pi() of said futex_q + * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), + * which in turn means that futex_lock_pi() still has a reference on + * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. + */ WARN_ON(!atomic_read(&pi_state->refcount)); /* + * Now that we have a pi_state, we can acquire wait_lock + * and do the state validation. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Since {uval, pi_state} is serialized by wait_lock, and our current + * uval was read without holding it, it can have changed. Verify it + * still is what we expect it to be, otherwise retry the entire + * operation. + */ + if (get_futex_value_locked(&uval2, uaddr)) + goto out_efault; + + if (uval != uval2) + goto out_eagain; + + /* * Handle the owner died case: */ if (uval & FUTEX_OWNER_DIED) { @@ -1006,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * is not 0. Inconsistent state. [5] */ if (pid) - return -EINVAL; + goto out_einval; /* * Take a ref on the state and return success. [4] */ - goto out_state; + goto out_attach; } /* @@ -1022,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * Take a ref on the state and return success. [6] */ if (!pid) - goto out_state; + goto out_attach; } else { /* * If the owner died bit is not set, then the pi_state * must have an owner. [7] */ if (!pi_state->owner) - return -EINVAL; + goto out_einval; } /* @@ -1038,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * user space TID. [9/10] */ if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; -out_state: - atomic_inc(&pi_state->refcount); + goto out_einval; + +out_attach: + get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); *ps = pi_state; return 0; + +out_einval: + ret = -EINVAL; + goto out_error; + +out_eagain: + ret = -EAGAIN; + goto out_error; + +out_efault: + ret = -EFAULT; + goto out_error; + +out_error: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } /* @@ -1093,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. */ pi_state = alloc_pi_state(); @@ -1117,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, +static int lookup_pi_state(u32 __user *uaddr, u32 uval, + struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps) { - struct futex_q *match = futex_top_waiter(hb, key); + struct futex_q *top_waiter = futex_top_waiter(hb, key); /* * If there is a waiter on that futex, validate it and * attach to the pi_state when the validation succeeds. */ - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * We are the first waiter - try to look up the owner based on @@ -1146,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; - /*If user space value changed, let the caller retry */ + /* If user space value changed, let the caller retry */ return curval != uval ? -EAGAIN : 0; } @@ -1174,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { u32 uval, newval, vpid = task_pid_vnr(task); - struct futex_q *match; + struct futex_q *top_waiter; int ret; /* @@ -1200,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * Lookup existing state first. If it exists, try to attach to * its pi_state. */ - match = futex_top_waiter(hb, key); - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + top_waiter = futex_top_waiter(hb, key); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * No waiter and user TID is 0. We are here because the @@ -1283,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) wake_q_add(wake_q, p); __unqueue_futex(q); /* - * The waiting task can free the futex_q as soon as - * q->lock_ptr = NULL is written, without taking any locks. A - * memory barrier is required here to prevent the following - * store to lock_ptr from getting ahead of the plist_del. + * The waiting task can free the futex_q as soon as q->lock_ptr = NULL + * is written, without taking any locks. This is possible in the event + * of a spurious wakeup, for example. A memory barrier is required here + * to prevent the following store to lock_ptr from getting ahead of the + * plist_del in __unqueue_futex(). */ - smp_wmb(); - q->lock_ptr = NULL; + smp_store_release(&q->lock_ptr, NULL); } -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, - struct futex_hash_bucket *hb) +/* + * Caller must hold a reference on @pi_state. + */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - struct task_struct *new_owner; - struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + struct task_struct *new_owner; + bool postunlock = false; DEFINE_WAKE_Q(wake_q); - bool deboost; int ret = 0; - if (!pi_state) - return -EINVAL; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - return -EINVAL; - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + if (WARN_ON_ONCE(!new_owner)) { + /* + * As per the comment in futex_unlock_pi() this should not happen. + * + * When this happens, give up our locks and try again, giving + * the futex_lock_pi() instance time to complete, either by + * waiting on the rtmutex or removing itself from the futex + * queue. + */ + ret = -EAGAIN; + goto out_unlock; + } /* - * It is possible that the next waiter (the one that brought - * this owner to the kernel) timed out and is no longer - * waiting on the lock. - */ - if (!new_owner) - new_owner = this->task; - - /* - * We pass it to the next owner. The WAITERS bit is always - * kept enabled while there is PI state around. We cleanup the - * owner died bit, because we are the owner. + * We pass it to the next owner. The WAITERS bit is always kept + * enabled while there is PI state around. We cleanup the owner + * died bit, because we are the owner. */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); @@ -1335,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; + } else if (curval != uval) { /* * If a unconditional UNLOCK_PI operation (user space did not @@ -1347,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else ret = -EINVAL; } - if (ret) { - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return ret; - } + + if (ret) + goto out_unlock; + + /* + * This is a point of no return; once we modify the uval there is no + * going back and subsequent operations must not fail. + */ raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1363,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, pi_state->owner = new_owner; raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); - deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - /* - * First unlock HB so the waiter does not spin on it once he got woken - * up. Second wake up the waiter before the priority is adjusted. If we - * deboost first (and lose our higher priority), then the task might get - * scheduled away before the wake up can take place. - */ - spin_unlock(&hb->lock); - wake_up_q(&wake_q); - if (deboost) - rt_mutex_adjust_prio(current); + if (postunlock) + rt_mutex_postunlock(&wake_q); - return 0; + return ret; } /* @@ -1824,7 +1913,7 @@ retry_private: * If that call succeeds then we have pi_state and an * initial refcount on it. */ - ret = lookup_pi_state(ret, hb2, &key2, &pi_state); + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); } switch (ret) { @@ -1907,7 +1996,7 @@ retry_private: * refcount on the pi_state and store the pointer in * the futex_q object of the waiter. */ - atomic_inc(&pi_state->refcount); + get_pi_state(pi_state); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, @@ -2007,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb) hb_waiters_dec(hb); } -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -2037,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + __queue_me(q, hb); spin_unlock(&hb->lock); } @@ -2123,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; - struct task_struct *oldowner = pi_state->owner; u32 uval, uninitialized_var(curval), newval; + struct task_struct *oldowner; int ret; + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + oldowner = pi_state->owner; /* Owner died? */ if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; @@ -2134,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the * previous highest priority waiter or we are the highest priority - * waiter but failed to get the rtmutex the first time. + * waiter but have failed to get the rtmutex the first time. + * * We have to replace the newowner TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * @@ -2142,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * because we can fault here. Imagine swapped out pages or a fork * that marked all the anonymous memory readonly for cow. * - * Modifying pi_state _before_ the user space value would - * leave the pi_state in an inconsistent state when we fault - * here, because we need to drop the hash bucket lock to - * handle the fault. This might be observed in the PID check - * in lookup_pi_state. + * Modifying pi_state _before_ the user space value would leave the + * pi_state in an inconsistent state when we fault here, because we + * need to drop the locks to handle the fault. This might be observed + * in the PID check in lookup_pi_state. */ retry: if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; - while (1) { + for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) @@ -2167,47 +2264,60 @@ retry: * itself. */ if (pi_state->owner != NULL) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - raw_spin_lock_irq(&newowner->pi_lock); + raw_spin_lock(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock_irq(&newowner->pi_lock); + raw_spin_unlock(&newowner->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return 0; /* - * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the highest priority - * waiter itself or the task which stole the rtmutex) the - * chance to try the fixup of the pi_state. So once we are - * back from handling the fault we need to check the pi_state - * after reacquiring the hash bucket lock and before trying to - * do another fixup. When the fixup has been done already we - * simply return. + * To handle the page fault we need to drop the locks here. That gives + * the other task (either the highest priority waiter itself or the + * task which stole the rtmutex) the chance to try the fixup of the + * pi_state. So once we are back from handling the fault we need to + * check the pi_state after reacquiring the locks and before trying to + * do another fixup. When the fixup has been done already we simply + * return. + * + * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely + * drop hb->lock since the caller owns the hb -> futex_q relation. + * Dropping the pi_mutex->wait_lock requires the state revalidate. */ handle_fault: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); ret = fault_in_user_writeable(uaddr); spin_lock(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) - return 0; + if (pi_state->owner != oldowner) { + ret = 0; + goto out_unlock; + } if (ret) - return ret; + goto out_unlock; goto retry; + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } static long futex_wait_restart(struct restart_block *restart); @@ -2229,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - struct task_struct *owner; int ret = 0; if (locked) { /* * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: + * + * We can safely read pi_state->owner without holding wait_lock + * because we now own the rt_mutex, only the owner will attempt + * to change it. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); @@ -2243,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) } /* - * Catch the rare case, where the lock was released when we were on the - * way back before we locked the hash bucket. - */ - if (q->pi_state->owner == current) { - /* - * Try to get the rt_mutex now. This might fail as some other - * task acquired the rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { - locked = 1; - goto out; - } - - /* - * pi_state is incorrect, some other task did a lock steal and - * we returned due to timeout or signal without taking the - * rt_mutex. Too late. - */ - raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); - owner = rt_mutex_owner(&q->pi_state->pi_mutex); - if (!owner) - owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); - ret = fixup_pi_state_owner(uaddr, q, owner); - goto out; - } - - /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " "pi-state %p\n", ret, q->pi_state->pi_mutex.owner, q->pi_state->owner); + } out: return ret ? ret : locked; @@ -2503,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; int res, ret; @@ -2555,25 +2642,68 @@ retry_private: } } + WARN_ON(!q.pi_state); + /* * Only actually queue now that the atomic ops are done: */ - queue_me(&q, hb); + __queue_me(&q, hb); - WARN_ON(!q.pi_state); - /* - * Block on the PI mutex: - */ - if (!trylock) { - ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); - } else { - ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; } + rt_mutex_init_waiter(&rt_waiter); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling rt_mutex_start_proxy_lock(). This still fully + * serializes against futex_unlock_pi() as that does the exact same + * lock handoff sequence. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + + if (ret) { + if (ret == 1) + ret = 0; + + spin_lock(q.lock_ptr); + goto no_block; + } + + + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + spin_lock(q.lock_ptr); /* + * If we failed to acquire the lock (signal/timeout), we must + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex + * wait lists consistent. + * + * In particular; it is important that futex_unlock_pi() can not + * observe this inconsistency. + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + +no_block: + /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ @@ -2589,12 +2719,19 @@ retry_private: * If fixup_owner() faulted and was unable to handle the fault, unlock * it and return the fault to userspace. */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) - rt_mutex_unlock(&q.pi_state->pi_mutex); + if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* Unqueue and drop the lock */ unqueue_me_pi(&q); + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + goto out_put_key; out_unlock_put_key: @@ -2603,8 +2740,10 @@ out_unlock_put_key: out_put_key: put_futex_key(&q.key); out: - if (to) + if (to) { + hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); + } return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: @@ -2631,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; - struct futex_q *match; + struct futex_q *top_waiter; int ret; retry: @@ -2655,12 +2794,37 @@ retry: * all and we at least want to know if user space fiddled * with the futex value instead of blindly unlocking. */ - match = futex_top_waiter(hb, &key); - if (match) { - ret = wake_futex_pi(uaddr, uval, match, hb); + top_waiter = futex_top_waiter(hb, &key); + if (top_waiter) { + struct futex_pi_state *pi_state = top_waiter->pi_state; + + ret = -EINVAL; + if (!pi_state) + goto out_unlock; + + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + goto out_unlock; + + get_pi_state(pi_state); /* - * In case of success wake_futex_pi dropped the hash - * bucket lock. + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! No tricky corner cases. */ if (!ret) goto out_putkey; @@ -2675,7 +2839,6 @@ retry: * setting the FUTEX_WAITERS bit. Try again. */ if (ret == -EAGAIN) { - spin_unlock(&hb->lock); put_futex_key(&key); goto retry; } @@ -2683,7 +2846,7 @@ retry: * wake_futex_pi has detected invalid state. Tell user * space. */ - goto out_unlock; + goto out_putkey; } /* @@ -2693,8 +2856,10 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + spin_unlock(&hb->lock); goto pi_faulted; + } /* * If uval has changed, let user space handle it. @@ -2708,7 +2873,6 @@ out_putkey: return ret; pi_faulted: - spin_unlock(&hb->lock); put_futex_key(&key); ret = fault_in_user_writeable(uaddr); @@ -2812,8 +2976,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; - struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; @@ -2839,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - debug_rt_mutex_init_waiter(&rt_waiter); - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); - RB_CLEAR_NODE(&rt_waiter.tree_entry); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2897,6 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. @@ -2905,6 +3070,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, spin_unlock(q.lock_ptr); } } else { + struct rt_mutex *pi_mutex; + /* * We have been woken up by futex_unlock_pi(), a timeout, or a * signal. futex_unlock_pi() will not destroy the lock_ptr nor @@ -2912,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); - debug_rt_mutex_free_waiter(&rt_waiter); + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. @@ -2928,18 +3098,26 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; + /* + * If fixup_pi_state_owner() faulted and was unable to handle + * the fault, unlock the rt_mutex and return the fault to + * userspace. + */ + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } + /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } - /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock the rt_mutex and return the fault to userspace. - */ - if (ret == -EFAULT) { - if (pi_mutex && rt_mutex_owner(pi_mutex) == current) - rt_mutex_unlock(pi_mutex); - } else if (ret == -EINTR) { + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + + if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling * futex_lock_pi() directly. We could restart this syscall, but diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 2f9df37940a0..c51a49c9be70 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_icall_topn); +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 6a5c239c7669..46a18e72bce6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) +#if (__GNUC__ >= 7) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/groups.c b/kernel/groups.c index 8dd7a61b7115..d09727692a2a 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); if (!gi) return NULL; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 40c07e4fa116..751593ed7c0b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -16,6 +16,9 @@ #include <linux/export.h> #include <linux/sysctl.h> #include <linux/utsname.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> + #include <trace/events/sched.h> /* @@ -40,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; +static bool hung_task_show_lock; static struct task_struct *watchdog_task; @@ -117,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_all_locks(); + hung_task_show_lock = true; } touch_nmi_watchdog(); if (sysctl_hung_task_panic) { + if (hung_task_show_lock) + debug_show_all_locks(); trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); } @@ -169,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; + hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) @@ -184,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); + if (hung_task_show_lock) + debug_show_all_locks(); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4544b115f5eb..e2d356dd7581 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; + int n, nodes, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; @@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) goto done; } - /* Spread the vectors per node */ - vecs_per_node = affv / nodes; - /* Account for rounding errors */ - extra_vecs = affv - (nodes * vecs_per_node); - for_each_node_mask(n, nodemsk) { - int ncpus, v, vecs_to_assign = vecs_per_node; + int ncpus, v, vecs_to_assign, vecs_per_node; + + /* Spread the vectors per node */ + vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); + vecs_to_assign = min(vecs_per_node, ncpus); + + /* Account for rounding errors */ + extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); for (v = 0; curvec < last_affv && v < vecs_to_assign; curvec++, v++) { @@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Account for extra vectors to compensate rounding errors */ if (extra_vecs) { cpus_per_vec++; - if (!--extra_vecs) - vecs_per_node++; + --extra_vecs; } irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } if (curvec >= last_affv) break; + --nodes; } done: diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index be3c34e4f2ac..686be4b73018 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq) irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); - action_ret = action->thread_fn(action->irq, action->dev_id); + action_ret = IRQ_NONE; + for_each_action_of_desc(desc, action) + action_ret |= action->thread_fn(action->irq, action->dev_id); + if (!noirqdebug) note_interrupt(desc, action_ret); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 74d90a754268..1613bfd48365 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -2,6 +2,7 @@ #include <linux/interrupt.h> #include <linux/device.h> #include <linux/gfp.h> +#include <linux/irq.h> /* * Device resource management aware IRQ request/free implementation. @@ -33,7 +34,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * @thread_fn: function to be called in a threaded interrupt context. NULL * for devices which handle everything in @handler * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL * @dev_id: A cookie passed back to the handler function * * Except for the extra @dev argument, this function takes the @@ -57,6 +58,9 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; + if (!devname) + devname = dev_name(dev); + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, dev_id); if (rc) { @@ -80,7 +84,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * @thread_fn: function to be called in a threaded interrupt context. NULL * for devices which handle everything in @handler * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL * @dev_id: A cookie passed back to the handler function * * Except for the extra @dev argument, this function takes the @@ -103,6 +107,9 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; + if (!devname) + devname = dev_name(dev); + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); if (rc < 0) { devres_free(dr); @@ -137,3 +144,57 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); + +struct irq_desc_devres { + unsigned int from; + unsigned int cnt; +}; + +static void devm_irq_desc_release(struct device *dev, void *res) +{ + struct irq_desc_devres *this = res; + + irq_free_descs(this->from, this->cnt); +} + +/** + * __devm_irq_alloc_descs - Allocate and initialize a range of irq descriptors + * for a managed device + * @dev: Device to allocate the descriptors for + * @irq: Allocate for specific irq number if irq >= 0 + * @from: Start the search from this irq number + * @cnt: Number of consecutive irqs to allocate + * @node: Preferred node on which the irq descriptor should be allocated + * @owner: Owning module (can be NULL) + * @affinity: Optional pointer to an affinity mask array of size @cnt + * which hints where the irq descriptors should be allocated + * and which default affinities to use + * + * Returns the first irq number or error code. + * + * Note: Use the provided wrappers (devm_irq_alloc_desc*) for simplicity. + */ +int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, + unsigned int cnt, int node, struct module *owner, + const struct cpumask *affinity) +{ + struct irq_desc_devres *dr; + int base; + + dr = devres_alloc(devm_irq_desc_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + base = __irq_alloc_descs(irq, from, cnt, node, owner, affinity); + if (base < 0) { + devres_free(dr); + return base; + } + + dr->from = base; + dr->cnt = cnt; + devres_add(dev, dr); + + return base; +} +EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b59e6768c5e9..31805f237396 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -278,6 +278,31 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** + * irq_domain_check_msi_remap - Check whether all MSI irq domains implement + * IRQ remapping + * + * Return: false if any MSI irq domain does not support IRQ remapping, + * true otherwise (including if there is no MSI irq domain) + */ +bool irq_domain_check_msi_remap(void) +{ + struct irq_domain *h; + bool ret = true; + + mutex_lock(&irq_domain_mutex); + list_for_each_entry(h, &irq_domain_list, link) { + if (irq_domain_is_msi(h) && + !irq_domain_hierarchical_is_msi_remap(h)) { + ret = false; + break; + } + } + mutex_unlock(&irq_domain_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap); + +/** * irq_set_default_host() - Set a "default" irq domain * @domain: default domain pointer * @@ -1408,6 +1433,20 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) if (domain->ops->alloc) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } + +/** + * irq_domain_hierarchical_is_msi_remap - Check if the domain or any + * parent has MSI remapping support + * @domain: domain pointer + */ +bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain) +{ + for (; domain; domain = domain->parent) { + if (irq_domain_is_msi_remap(domain)) + return true; + } + return false; +} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /** * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6b669593e7eb..070be980c37a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,8 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/sched/task.h> +#include <uapi/linux/sched/types.h> #include <linux/task_work.h> #include "internals.h" @@ -353,7 +355,7 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve the managed affinity setting and an userspace affinity + * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || @@ -850,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (desc->irq_common_data.affinity) + if (cpumask_available(desc->irq_common_data.affinity)) cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; @@ -1210,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * set the trigger type must match. Also all must * agree on ONESHOT. */ + unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data); + if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + (oldtype != (new->flags & IRQF_TRIGGER_MASK)) || ((old->flags ^ new->flags) & IRQF_ONESHOT)) goto mismatch; @@ -1555,7 +1559,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) struct irq_desc *desc = irq_to_desc(irq); if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(irq, act->dev_id); + __free_irq(irq, act->dev_id); } EXPORT_SYMBOL_GPL(remove_irq); @@ -1572,20 +1576,27 @@ EXPORT_SYMBOL_GPL(remove_irq); * have completed. * * This function must not be called from interrupt context. + * + * Returns the devname argument passed to request_irq. */ -void free_irq(unsigned int irq, void *dev_id) +const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return; + return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif - kfree(__free_irq(irq, dev_id)); + action = __free_irq(irq, dev_id); + devname = action->name; + kfree(action); + return devname; } EXPORT_SYMBOL(free_irq); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ee230063f033..ddc2f5427f75 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -270,8 +270,8 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_create_hierarchy(parent, 0, 0, fwnode, - &msi_domain_ops, info); + return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index feaa813b84a9..c53edad7b459 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -487,6 +487,8 @@ int show_interrupts(struct seq_file *p, void *v) } if (desc->irq_data.domain) seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); + else + seq_printf(p, " %*s", prec, ""); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 5707f97a3e6a..061ba7eed4ed 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -175,7 +175,9 @@ out: static inline int bad_action_ret(irqreturn_t action_ret) { - if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) + unsigned int r = action_ret; + + if (likely(r <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) return 0; return 1; } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a9b8cf500591..6c9cb208ac48 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -236,12 +236,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry static inline struct jump_entry *static_key_entries(struct static_key *key) { - return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); + WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); + return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { - return (unsigned long)key->entries & JUMP_TYPE_MASK; + return key->type & JUMP_TYPE_TRUE; +} + +static inline bool static_key_linked(struct static_key *key) +{ + return key->type & JUMP_TYPE_LINKED; +} + +static inline void static_key_clear_linked(struct static_key *key) +{ + key->type &= ~JUMP_TYPE_LINKED; +} + +static inline void static_key_set_linked(struct static_key *key) +{ + key->type |= JUMP_TYPE_LINKED; } static inline struct static_key *jump_entry_key(struct jump_entry *entry) @@ -254,6 +270,26 @@ static bool jump_entry_branch(struct jump_entry *entry) return (unsigned long)entry->key & 1UL; } +/*** + * A 'struct static_key' uses a union such that it either points directly + * to a table of 'struct jump_entry' or to a linked list of modules which in + * turn point to 'struct jump_entry' tables. + * + * The two lower bits of the pointer are used to keep track of which pointer + * type is in use and to store the initial branch direction, we use an access + * function which preserves these bits. + */ +static void static_key_set_entries(struct static_key *key, + struct jump_entry *entries) +{ + unsigned long type; + + WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); + type = key->type & JUMP_TYPE_MASK; + key->entries = entries; + key->type |= type; +} + static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); @@ -313,13 +349,7 @@ void __init jump_label_init(void) continue; key = iterk; - /* - * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. - */ - *((unsigned long *)&key->entries) += (unsigned long)iter; -#ifdef CONFIG_MODULES - key->next = NULL; -#endif + static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); @@ -343,6 +373,29 @@ struct static_key_mod { struct module *mod; }; +static inline struct static_key_mod *static_key_mod(struct static_key *key) +{ + WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED)); + return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); +} + +/*** + * key->type and key->next are the same via union. + * This sets key->next and preserves the type bits. + * + * See additional comments above static_key_set_entries(). + */ +static void static_key_set_mod(struct static_key *key, + struct static_key_mod *mod) +{ + unsigned long type; + + WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); + type = key->type & JUMP_TYPE_MASK; + key->next = mod; + key->type |= type; +} + static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; @@ -365,11 +418,23 @@ static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; - for (mod = key->next; mod; mod = mod->next) { - struct module *m = mod->mod; + for (mod = static_key_mod(key); mod; mod = mod->next) { + struct jump_entry *stop; + struct module *m; + + /* + * NULL if the static_key is defined in a module + * that does not use it + */ + if (!mod->entries) + continue; - __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries); + m = mod->mod; + if (!m) + stop = __stop___jump_table; + else + stop = m->jump_entries + m->num_jump_entries; + __jump_label_update(key, mod->entries, stop); } } @@ -404,7 +469,7 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; - struct static_key_mod *jlm; + struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -421,20 +486,32 @@ static int jump_label_add_module(struct module *mod) key = iterk; if (within_module(iter->key, mod)) { - /* - * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. - */ - *((unsigned long *)&key->entries) += (unsigned long)iter; - key->next = NULL; + static_key_set_entries(key, iter); continue; } jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; + if (!static_key_linked(key)) { + jlm2 = kzalloc(sizeof(struct static_key_mod), + GFP_KERNEL); + if (!jlm2) { + kfree(jlm); + return -ENOMEM; + } + preempt_disable(); + jlm2->mod = __module_address((unsigned long)key); + preempt_enable(); + jlm2->entries = static_key_entries(key); + jlm2->next = NULL; + static_key_set_mod(key, jlm2); + static_key_set_linked(key); + } jlm->mod = mod; jlm->entries = iter; - jlm->next = key->next; - key->next = jlm; + jlm->next = static_key_mod(key); + static_key_set_mod(key, jlm); + static_key_set_linked(key); /* Only update if we've changed from our initial state */ if (jump_label_type(iter) != jump_label_init_type(iter)) @@ -461,16 +538,34 @@ static void jump_label_del_module(struct module *mod) if (within_module(iter->key, mod)) continue; + /* No memory during module load */ + if (WARN_ON(!static_key_linked(key))) + continue; + prev = &key->next; - jlm = key->next; + jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } - if (jlm) { + /* No memory during module load */ + if (WARN_ON(!jlm)) + continue; + + if (prev == &key->next) + static_key_set_mod(key, jlm->next); + else *prev = jlm->next; + + kfree(jlm); + + jlm = static_key_mod(key); + /* if only one etry is left, fold it back into the static_key */ + if (jlm->next == NULL) { + static_key_set_entries(key, jlm->entries); + static_key_clear_linked(key); kfree(jlm); } } @@ -499,8 +594,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_COMING: jump_label_lock(); ret = jump_label_add_module(mod); - if (ret) + if (ret) { + WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); jump_label_del_module(mod); + } jump_label_unlock(); break; case MODULE_STATE_GOING: @@ -561,11 +658,14 @@ int jump_label_text_reserved(void *start, void *end) static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = static_key_entries(key); + struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; - __jump_label_mod_update(key); + if (static_key_linked(key)) { + __jump_label_mod_update(key); + return; + } preempt_disable(); mod = __module_address((unsigned long)key); @@ -573,6 +673,7 @@ static void jump_label_update(struct static_key *key) stop = mod->jump_entries + mod->num_jump_entries; preempt_enable(); #endif + entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fafd1a3ef0da..6a3b249a2ae1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/filter.h> #include <linux/compiler.h> #include <asm/sections.h> @@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; + if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); - - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); + return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || + !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } /* @@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { + const char *ret; + namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr, return namebuf; } - /* See if it's in a module. */ - return module_address_lookup(addr, symbolsize, offset, modname, - namebuf); + /* See if it's in a module or a BPF JITed image. */ + ret = module_address_lookup(addr, symbolsize, offset, + modname, namebuf); + if (!ret) + ret = bpf_address_lookup(addr, symbolsize, + offset, modname, namebuf); + return ret; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; + loff_t pos_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -481,13 +490,27 @@ struct kallsym_iter { static int get_ksymbol_mod(struct kallsym_iter *iter) { - if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, - &iter->type, iter->name, iter->module_name, - &iter->exported) < 0) + int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_mod_end = iter->pos; return 0; + } + return 1; } +static int get_ksymbol_bpf(struct kallsym_iter *iter) +{ + iter->module_name[0] = '\0'; + iter->exported = 0; + return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; +} + /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { @@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; + if (new_pos == 0) + iter->pos_mod_end = 0; +} + +static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) +{ + iter->pos = pos; + + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) + return get_ksymbol_bpf(iter); + + if (!get_ksymbol_mod(iter)) + return get_ksymbol_bpf(iter); + + return 1; } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. */ - if (pos >= kallsyms_num_syms) { - iter->pos = pos; - return get_ksymbol_mod(iter); - } + if (pos >= kallsyms_num_syms) + return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) diff --git a/kernel/kcov.c b/kernel/kcov.c index 85e5546cd791..cd771993f96f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. - * The checks for whether we are in an interrupt are open-coded, because - * 1. We can't use in_interrupt() here, since it also returns true - * when we are inside local_bh_disable() section. - * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), - * since that leads to slower generated code (three separate tests, - * one for each of the flags). */ - if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET - | NMI_MASK))) + if (!t || !in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5617cc412444..ae1a3ba24df5 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; @@ -916,7 +910,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(regs); /* @@ -996,34 +990,6 @@ unlock: return ret; } -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; @@ -1085,403 +1051,6 @@ subsys_initcall(crash_notes_memory_init); /* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - -/* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b56a558e406d..b118735fea9d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -614,13 +614,13 @@ static int kexec_calculate_store_digests(struct kimage *image) ret = crypto_shash_final(desc, digest); if (ret) goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha_regions", - sha_regions, sha_region_sz, 0); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", + sha_regions, sha_region_sz, 0); if (ret) goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); if (ret) goto out_free_digest; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 4cef7e4706b0..799a8a452187 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,11 +15,7 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE -struct kexec_sha_region { - unsigned long start; - unsigned long len; -}; - +#include <linux/purgatory.h> void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/kmod.c b/kernel/kmod.c index d45c96073afb..563f97e2be36 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -20,6 +20,8 @@ */ #include <linux/module.h> #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> @@ -516,7 +518,7 @@ static void helper_unlock(void) * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -528,7 +530,12 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else sub_info->path = path; +#endif sub_info->argv = argv; sub_info->envp = envp; @@ -566,6 +573,15 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) retval = -EBUSY; goto out; } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free @@ -613,7 +629,7 @@ EXPORT_SYMBOL(call_usermodehelper_exec); * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ -int call_usermodehelper(char *path, char **argv, char **envp, int wait) +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 43460104f119..7367e0ec6f81 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -58,15 +58,6 @@ #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) -/* - * Some oddball architectures like 64bit powerpc have function descriptors - * so this must be overridable. - */ -#ifndef kprobe_lookup_name -#define kprobe_lookup_name(name, addr) \ - addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) -#endif - static int kprobes_initialized; static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; @@ -81,6 +72,12 @@ static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; +kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, + unsigned int __unused) +{ + return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); +} + static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); @@ -149,9 +146,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) struct kprobe_insn_page *kip; kprobe_opcode_t *slot = NULL; + /* Since the slot array is not protected by rcu, we need a mutex */ mutex_lock(&c->mutex); retry: - list_for_each_entry(kip, &c->pages, list) { + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { int i; for (i = 0; i < slots_per_page(c); i++) { @@ -159,6 +158,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->slot_used[i] = SLOT_USED; kip->nused++; slot = kip->insns + (i * c->insn_size); + rcu_read_unlock(); goto out; } } @@ -167,6 +167,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) WARN_ON(1); } } + rcu_read_unlock(); /* If there are any garbage slots, collect it and try again. */ if (c->nr_garbage && collect_garbage_slots(c) == 0) @@ -193,7 +194,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->nused = 1; kip->ngarbage = 0; kip->cache = c; - list_add(&kip->list, &c->pages); + list_add_rcu(&kip->list, &c->pages); slot = kip->insns; out: mutex_unlock(&c->mutex); @@ -213,7 +214,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { - list_del(&kip->list); + list_del_rcu(&kip->list); + synchronize_rcu(); kip->cache->free(kip->insns); kfree(kip); } @@ -235,8 +237,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) continue; kip->ngarbage = 0; /* we will collect all garbages */ for (i = 0; i < slots_per_page(c); i++) { - if (kip->slot_used[i] == SLOT_DIRTY && - collect_one_slot(kip, i)) + if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } @@ -248,29 +249,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c, kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + long idx; mutex_lock(&c->mutex); - list_for_each_entry(kip, &c->pages, list) { - long idx = ((long)slot - (long)kip->insns) / - (c->insn_size * sizeof(kprobe_opcode_t)); - if (idx >= 0 && idx < slots_per_page(c)) { - WARN_ON(kip->slot_used[idx] != SLOT_USED); - if (dirty) { - kip->slot_used[idx] = SLOT_DIRTY; - kip->ngarbage++; - if (++c->nr_garbage > slots_per_page(c)) - collect_garbage_slots(c); - } else - collect_one_slot(kip, idx); + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + idx = ((long)slot - (long)kip->insns) / + (c->insn_size * sizeof(kprobe_opcode_t)); + if (idx >= 0 && idx < slots_per_page(c)) goto out; - } } - /* Could not free this slot. */ + /* Could not find this slot. */ WARN_ON(1); + kip = NULL; out: + rcu_read_unlock(); + /* Mark and sweep: this may sleep */ + if (kip) { + /* Check double free */ + WARN_ON(kip->slot_used[idx] != SLOT_USED); + if (dirty) { + kip->slot_used[idx] = SLOT_DIRTY; + kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); + } else { + collect_one_slot(kip, idx); + } + } mutex_unlock(&c->mutex); } +/* + * Check given address is on the page of kprobe instruction slots. + * This will be used for checking whether the address on a stack + * is on a text area or not. + */ +bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) +{ + struct kprobe_insn_page *kip; + bool ret = false; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if (addr >= (unsigned long)kip->insns && + addr < (unsigned long)kip->insns + PAGE_SIZE) { + ret = true; + break; + } + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { @@ -711,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p) arch_remove_optimized_kprobe(op); } +static inline +void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) +{ + if (!kprobe_ftrace(p)) + arch_prepare_optimized_kprobe(op, p); +} + /* Try to prepare optimized instructions */ static void prepare_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op, p); + __prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -731,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op, p); + __prepare_optimized_kprobe(op, p); return &op->kp; } @@ -1356,21 +1395,19 @@ bool within_kprobe_blacklist(unsigned long addr) * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. */ -static kprobe_opcode_t *kprobe_addr(struct kprobe *p) +static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr, + const char *symbol_name, unsigned int offset) { - kprobe_opcode_t *addr = p->addr; - - if ((p->symbol_name && p->addr) || - (!p->symbol_name && !p->addr)) + if ((symbol_name && addr) || (!symbol_name && !addr)) goto invalid; - if (p->symbol_name) { - kprobe_lookup_name(p->symbol_name, addr); + if (symbol_name) { + addr = kprobe_lookup_name(symbol_name, offset); if (!addr) return ERR_PTR(-ENOENT); } - addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); + addr = (kprobe_opcode_t *)(((char *)addr) + offset); if (addr) return addr; @@ -1378,6 +1415,11 @@ invalid: return ERR_PTR(-EINVAL); } +static kprobe_opcode_t *kprobe_addr(struct kprobe *p) +{ + return _kprobe_addr(p->addr, p->symbol_name, p->offset); +} + /* Check passed kprobe is valid and return kprobe in kprobe_table. */ static struct kprobe *__get_valid_kprobe(struct kprobe *p) { @@ -1705,6 +1747,13 @@ void unregister_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(unregister_kprobes); +int __weak kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} +NOKPROBE_SYMBOL(kprobe_exceptions_notify); + static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, .priority = 0x7fffffff /* we need to be notified first */ @@ -1834,6 +1883,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +bool __weak arch_function_offset_within_entry(unsigned long offset) +{ + return !offset; +} + +bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) +{ + kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); + + if (IS_ERR(kp_addr)) + return false; + + if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || + !arch_function_offset_within_entry(offset)) + return false; + + return true; +} + int register_kretprobe(struct kretprobe *rp) { int ret = 0; @@ -1841,6 +1909,9 @@ int register_kretprobe(struct kretprobe *rp) int i; void *addr; + if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) + return -EINVAL; + if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); if (IS_ERR(addr)) @@ -2151,8 +2222,8 @@ static int __init init_kprobes(void) if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { - kprobe_lookup_name(kretprobe_blacklist[i].name, - kretprobe_blacklist[i].addr); + kretprobe_blacklist[i].addr = + kprobe_lookup_name(kretprobe_blacklist[i].name, 0); if (!kretprobe_blacklist[i].addr) printk("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index ee1bc1bb8feb..23cd70651238 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_CORE + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_CORE */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -195,7 +199,7 @@ static ssize_t notes_read(struct file *filp, struct kobject *kobj, return count; } -static struct bin_attribute notes_attr = { +static struct bin_attribute notes_attr __ro_after_init = { .attr = { .name = "notes", .mode = S_IRUGO, @@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = { &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, +#endif +#ifdef CONFIG_CRASH_CORE &vmcoreinfo_attr.attr, #endif #ifndef CONFIG_TINY_RCU diff --git a/kernel/kthread.c b/kernel/kthread.c index 2318fba86277..26db528c1d88 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -5,7 +5,9 @@ * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ +#include <uapi/linux/sched/types.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> @@ -18,6 +20,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/cgroup.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -223,6 +226,7 @@ static int kthread(void *_create) ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } @@ -536,6 +540,7 @@ int kthreadd(void *unused) set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; + cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -850,7 +855,6 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, list_add(&work->node, &worker->delayed_work_list); work->worker = worker; - timer_stats_timer_set_start_info(&dwork->timer); timer->expires = jiffies + delay; add_timer(timer); } diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b5c30d9f46c5..96b4179cee6a 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -55,6 +55,8 @@ #include <linux/latencytop.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/stat.h> #include <linux/list.h> #include <linux/stacktrace.h> diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index e8780c0901d9..2b8bdb1925da 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o +livepatch-objs := core.o patch.o transition.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index af4643873e71..b9628e43c78f 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -24,61 +24,31 @@ #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/ftrace.h> #include <linux/list.h> #include <linux/kallsyms.h> #include <linux/livepatch.h> #include <linux/elf.h> #include <linux/moduleloader.h> +#include <linux/completion.h> #include <asm/cacheflush.h> - -/** - * struct klp_ops - structure for tracking registered ftrace ops structs - * - * A single ftrace_ops is shared between all enabled replacement functions - * (klp_func structs) which have the same old_addr. This allows the switch - * between function versions to happen instantaneously by updating the klp_ops - * struct's func_stack list. The winner is the klp_func at the top of the - * func_stack (front of the list). - * - * @node: node for the global klp_ops list - * @func_stack: list head for the stack of klp_func's (active func is on top) - * @fops: registered ftrace ops struct - */ -struct klp_ops { - struct list_head node; - struct list_head func_stack; - struct ftrace_ops fops; -}; +#include "core.h" +#include "patch.h" +#include "transition.h" /* - * The klp_mutex protects the global lists and state transitions of any - * structure reachable from them. References to any structure must be obtained - * under mutex protection (except in klp_ftrace_handler(), which uses RCU to - * ensure it gets consistent data). + * klp_mutex is a coarse lock which serializes access to klp data. All + * accesses to klp-related variables and structures must have mutex protection, + * except within the following functions which carefully avoid the need for it: + * + * - klp_ftrace_handler() + * - klp_update_patch_state() */ -static DEFINE_MUTEX(klp_mutex); +DEFINE_MUTEX(klp_mutex); static LIST_HEAD(klp_patches); -static LIST_HEAD(klp_ops); static struct kobject *klp_root_kobj; -static struct klp_ops *klp_find_ops(unsigned long old_addr) -{ - struct klp_ops *ops; - struct klp_func *func; - - list_for_each_entry(ops, &klp_ops, node) { - func = list_first_entry(&ops->func_stack, struct klp_func, - stack_node); - if (func->old_addr == old_addr) - return ops; - } - - return NULL; -} - static bool klp_is_module(struct klp_object *obj) { return obj->name; @@ -117,7 +87,6 @@ static void klp_find_object_module(struct klp_object *obj) mutex_unlock(&module_mutex); } -/* klp_mutex must be held by caller */ static bool klp_is_patch_registered(struct klp_patch *patch) { struct klp_patch *mypatch; @@ -182,7 +151,10 @@ static int klp_find_object_symbol(const char *objname, const char *name, }; mutex_lock(&module_mutex); - kallsyms_on_each_symbol(klp_find_callback, &args); + if (objname) + module_kallsyms_on_each_symbol(klp_find_callback, &args); + else + kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); /* @@ -233,7 +205,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { - pr_err("symbol %s is not marked as a livepatch symbol", + pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); return -EINVAL; } @@ -243,7 +215,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) ".klp.sym.%55[^.].%127[^,],%lu", objname, symname, &sympos); if (cnt != 3) { - pr_err("symbol %s has an incorrectly formatted name", + pr_err("symbol %s has an incorrectly formatted name\n", strtab + sym->st_name); return -EINVAL; } @@ -288,7 +260,7 @@ static int klp_write_object_relocations(struct module *pmod, */ cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); if (cnt != 1) { - pr_err("section %s has an incorrectly formatted name", + pr_err("section %s has an incorrectly formatted name\n", secname); ret = -EINVAL; break; @@ -311,191 +283,30 @@ static int klp_write_object_relocations(struct module *pmod, return ret; } -static void notrace klp_ftrace_handler(unsigned long ip, - unsigned long parent_ip, - struct ftrace_ops *fops, - struct pt_regs *regs) -{ - struct klp_ops *ops; - struct klp_func *func; - - ops = container_of(fops, struct klp_ops, fops); - - rcu_read_lock(); - func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, - stack_node); - if (WARN_ON_ONCE(!func)) - goto unlock; - - klp_arch_set_pc(regs, (unsigned long)func->new_func); -unlock: - rcu_read_unlock(); -} - -/* - * Convert a function address into the appropriate ftrace location. - * - * Usually this is just the address of the function, but on some architectures - * it's more complicated so allow them to provide a custom behaviour. - */ -#ifndef klp_get_ftrace_location -static unsigned long klp_get_ftrace_location(unsigned long faddr) -{ - return faddr; -} -#endif - -static void klp_disable_func(struct klp_func *func) -{ - struct klp_ops *ops; - - if (WARN_ON(func->state != KLP_ENABLED)) - return; - if (WARN_ON(!func->old_addr)) - return; - - ops = klp_find_ops(func->old_addr); - if (WARN_ON(!ops)) - return; - - if (list_is_singular(&ops->func_stack)) { - unsigned long ftrace_loc; - - ftrace_loc = klp_get_ftrace_location(func->old_addr); - if (WARN_ON(!ftrace_loc)) - return; - - WARN_ON(unregister_ftrace_function(&ops->fops)); - WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); - - list_del_rcu(&func->stack_node); - list_del(&ops->node); - kfree(ops); - } else { - list_del_rcu(&func->stack_node); - } - - func->state = KLP_DISABLED; -} - -static int klp_enable_func(struct klp_func *func) -{ - struct klp_ops *ops; - int ret; - - if (WARN_ON(!func->old_addr)) - return -EINVAL; - - if (WARN_ON(func->state != KLP_DISABLED)) - return -EINVAL; - - ops = klp_find_ops(func->old_addr); - if (!ops) { - unsigned long ftrace_loc; - - ftrace_loc = klp_get_ftrace_location(func->old_addr); - if (!ftrace_loc) { - pr_err("failed to find location for function '%s'\n", - func->old_name); - return -EINVAL; - } - - ops = kzalloc(sizeof(*ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - ops->fops.func = klp_ftrace_handler; - ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | - FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; - - list_add(&ops->node, &klp_ops); - - INIT_LIST_HEAD(&ops->func_stack); - list_add_rcu(&func->stack_node, &ops->func_stack); - - ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); - if (ret) { - pr_err("failed to set ftrace filter for function '%s' (%d)\n", - func->old_name, ret); - goto err; - } - - ret = register_ftrace_function(&ops->fops); - if (ret) { - pr_err("failed to register ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); - goto err; - } - - - } else { - list_add_rcu(&func->stack_node, &ops->func_stack); - } - - func->state = KLP_ENABLED; - - return 0; - -err: - list_del_rcu(&func->stack_node); - list_del(&ops->node); - kfree(ops); - return ret; -} - -static void klp_disable_object(struct klp_object *obj) -{ - struct klp_func *func; - - klp_for_each_func(obj, func) - if (func->state == KLP_ENABLED) - klp_disable_func(func); - - obj->state = KLP_DISABLED; -} - -static int klp_enable_object(struct klp_object *obj) -{ - struct klp_func *func; - int ret; - - if (WARN_ON(obj->state != KLP_DISABLED)) - return -EINVAL; - - if (WARN_ON(!klp_is_object_loaded(obj))) - return -EINVAL; - - klp_for_each_func(obj, func) { - ret = klp_enable_func(func); - if (ret) { - klp_disable_object(obj); - return ret; - } - } - obj->state = KLP_ENABLED; - - return 0; -} - static int __klp_disable_patch(struct klp_patch *patch) { - struct klp_object *obj; + if (klp_transition_patch) + return -EBUSY; /* enforce stacking: only the last enabled patch can be disabled */ if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->state == KLP_ENABLED) + list_next_entry(patch, list)->enabled) return -EBUSY; - pr_notice("disabling patch '%s'\n", patch->mod->name); + klp_init_transition(patch, KLP_UNPATCHED); - klp_for_each_object(patch, obj) { - if (obj->state == KLP_ENABLED) - klp_disable_object(obj); - } + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the TIF_PATCH_PENDING writes in + * klp_start_transition(). In the rare case where klp_ftrace_handler() + * is called shortly after klp_update_patch_state() switches the task, + * this ensures the handler sees that func->transition is set. + */ + smp_wmb(); - patch->state = KLP_DISABLED; + klp_start_transition(); + klp_try_complete_transition(); + patch->enabled = false; return 0; } @@ -519,7 +330,7 @@ int klp_disable_patch(struct klp_patch *patch) goto err; } - if (patch->state == KLP_DISABLED) { + if (!patch->enabled) { ret = -EINVAL; goto err; } @@ -537,32 +348,61 @@ static int __klp_enable_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - if (WARN_ON(patch->state != KLP_DISABLED)) + if (klp_transition_patch) + return -EBUSY; + + if (WARN_ON(patch->enabled)) return -EINVAL; /* enforce stacking: only the first disabled patch can be enabled */ if (patch->list.prev != &klp_patches && - list_prev_entry(patch, list)->state == KLP_DISABLED) + !list_prev_entry(patch, list)->enabled) return -EBUSY; + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. + * + * Note: For immediate (no consistency model) patches we don't allow + * patch modules to unload since there is no safe/sane method to + * determine if a thread is still running in the patched code contained + * in the patch module once the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + pr_notice("enabling patch '%s'\n", patch->mod->name); + klp_init_transition(patch, KLP_PATCHED); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the ops->func_stack writes in + * klp_patch_object(), so that klp_ftrace_handler() will see the + * func->transition updates before the handler is registered and the + * new funcs become visible to the handler. + */ + smp_wmb(); + klp_for_each_object(patch, obj) { if (!klp_is_object_loaded(obj)) continue; - ret = klp_enable_object(obj); - if (ret) - goto unregister; + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to enable patch '%s'\n", + patch->mod->name); + + klp_cancel_transition(); + return ret; + } } - patch->state = KLP_ENABLED; + klp_start_transition(); + klp_try_complete_transition(); + patch->enabled = true; return 0; - -unregister: - WARN_ON(__klp_disable_patch(patch)); - return ret; } /** @@ -599,6 +439,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled + * /sys/kernel/livepatch/<patch>/transition * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ @@ -608,26 +449,34 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, { struct klp_patch *patch; int ret; - unsigned long val; + bool enabled; - ret = kstrtoul(buf, 10, &val); + ret = kstrtobool(buf, &enabled); if (ret) - return -EINVAL; - - if (val != KLP_DISABLED && val != KLP_ENABLED) - return -EINVAL; + return ret; patch = container_of(kobj, struct klp_patch, kobj); mutex_lock(&klp_mutex); - if (val == patch->state) { + if (!klp_is_patch_registered(patch)) { + /* + * Module with the patch could either disappear meanwhile or is + * not properly initialized yet. + */ + ret = -EINVAL; + goto err; + } + + if (patch->enabled == enabled) { /* already in requested state */ ret = -EINVAL; goto err; } - if (val == KLP_ENABLED) { + if (patch == klp_transition_patch) { + klp_reverse_transition(); + } else if (enabled) { ret = __klp_enable_patch(patch); if (ret) goto err; @@ -652,21 +501,33 @@ static ssize_t enabled_show(struct kobject *kobj, struct klp_patch *patch; patch = container_of(kobj, struct klp_patch, kobj); - return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); + return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled); +} + +static ssize_t transition_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, PAGE_SIZE-1, "%d\n", + patch == klp_transition_patch); } static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, + &transition_kobj_attr.attr, NULL }; static void klp_kobj_release_patch(struct kobject *kobj) { - /* - * Once we have a consistency model we'll need to module_put() the - * patch module here. See klp_register_patch() for more details. - */ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + complete(&patch->finish); } static struct kobj_type klp_ktype_patch = { @@ -737,7 +598,6 @@ static void klp_free_patch(struct klp_patch *patch) klp_free_objects_limited(patch, NULL); if (!list_empty(&patch->list)) list_del(&patch->list); - kobject_put(&patch->kobj); } static int klp_init_func(struct klp_object *obj, struct klp_func *func) @@ -746,7 +606,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) return -EINVAL; INIT_LIST_HEAD(&func->stack_node); - func->state = KLP_DISABLED; + func->patched = false; + func->transition = false; /* The format for the sysfs directory is <function,sympos> where sympos * is the nth occurrence of this symbol in kallsyms for the patched @@ -787,6 +648,22 @@ static int klp_init_object_loaded(struct klp_patch *patch, &func->old_addr); if (ret) return ret; + + ret = kallsyms_lookup_size_offset(func->old_addr, + &func->old_size, NULL); + if (!ret) { + pr_err("kallsyms size lookup failed for '%s'\n", + func->old_name); + return -ENOENT; + } + + ret = kallsyms_lookup_size_offset((unsigned long)func->new_func, + &func->new_size, NULL); + if (!ret) { + pr_err("kallsyms size lookup failed for '%s' replacement\n", + func->old_name); + return -ENOENT; + } } return 0; @@ -801,7 +678,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) if (!obj->funcs) return -EINVAL; - obj->state = KLP_DISABLED; + obj->patched = false; obj->mod = NULL; klp_find_object_module(obj); @@ -842,12 +719,15 @@ static int klp_init_patch(struct klp_patch *patch) mutex_lock(&klp_mutex); - patch->state = KLP_DISABLED; + patch->enabled = false; + init_completion(&patch->finish); ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, klp_root_kobj, "%s", patch->mod->name); - if (ret) - goto unlock; + if (ret) { + mutex_unlock(&klp_mutex); + return ret; + } klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); @@ -863,9 +743,12 @@ static int klp_init_patch(struct klp_patch *patch) free: klp_free_objects_limited(patch, obj); - kobject_put(&patch->kobj); -unlock: + mutex_unlock(&klp_mutex); + + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + return ret; } @@ -879,23 +762,29 @@ unlock: */ int klp_unregister_patch(struct klp_patch *patch) { - int ret = 0; + int ret; mutex_lock(&klp_mutex); if (!klp_is_patch_registered(patch)) { ret = -EINVAL; - goto out; + goto err; } - if (patch->state == KLP_ENABLED) { + if (patch->enabled) { ret = -EBUSY; - goto out; + goto err; } klp_free_patch(patch); -out: + mutex_unlock(&klp_mutex); + + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + + return 0; +err: mutex_unlock(&klp_mutex); return ret; } @@ -908,17 +797,18 @@ EXPORT_SYMBOL_GPL(klp_unregister_patch); * Initializes the data structure associated with the patch and * creates the sysfs interface. * + * There is no need to take the reference on the patch module here. It is done + * later when the patch is enabled. + * * Return: 0 on success, otherwise error */ int klp_register_patch(struct klp_patch *patch) { - int ret; - if (!patch || !patch->mod) return -EINVAL; if (!is_livepatch_module(patch->mod)) { - pr_err("module %s is not marked as a livepatch module", + pr_err("module %s is not marked as a livepatch module\n", patch->mod->name); return -EINVAL; } @@ -927,20 +817,16 @@ int klp_register_patch(struct klp_patch *patch) return -ENODEV; /* - * A reference is taken on the patch module to prevent it from being - * unloaded. Right now, we don't allow patch modules to unload since - * there is currently no method to determine if a thread is still - * running in the patched code contained in the patch module once - * the ftrace registration is successful. + * Architectures without reliable stack traces have to set + * patch->immediate because there's currently no way to patch kthreads + * with the consistency model. */ - if (!try_module_get(patch->mod)) - return -ENODEV; - - ret = klp_init_patch(patch); - if (ret) - module_put(patch->mod); + if (!klp_have_reliable_stack() && !patch->immediate) { + pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); + return -ENOSYS; + } - return ret; + return klp_init_patch(patch); } EXPORT_SYMBOL_GPL(klp_register_patch); @@ -975,13 +861,17 @@ int klp_module_coming(struct module *mod) goto err; } - if (patch->state == KLP_DISABLED) + /* + * Only patch the module if the patch is enabled or is + * in transition. + */ + if (!patch->enabled && patch != klp_transition_patch) break; pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); - ret = klp_enable_object(obj); + ret = klp_patch_object(obj); if (ret) { pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); @@ -1032,10 +922,14 @@ void klp_module_going(struct module *mod) if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (patch->state != KLP_DISABLED) { + /* + * Only unpatch the module if the patch is enabled or + * is in transition. + */ + if (patch->enabled || patch == klp_transition_patch) { pr_notice("reverting patch '%s' on unloading module '%s'\n", patch->mod->name, obj->mod->name); - klp_disable_object(obj); + klp_unpatch_object(obj); } klp_free_object_loaded(obj); diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h new file mode 100644 index 000000000000..c74f24c47837 --- /dev/null +++ b/kernel/livepatch/core.h @@ -0,0 +1,6 @@ +#ifndef _LIVEPATCH_CORE_H +#define _LIVEPATCH_CORE_H + +extern struct mutex klp_mutex; + +#endif /* _LIVEPATCH_CORE_H */ diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c new file mode 100644 index 000000000000..f8269036bf0b --- /dev/null +++ b/kernel/livepatch/patch.c @@ -0,0 +1,272 @@ +/* + * patch.c - livepatch patching functions + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/livepatch.h> +#include <linux/list.h> +#include <linux/ftrace.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/bug.h> +#include <linux/printk.h> +#include "patch.h" +#include "transition.h" + +static LIST_HEAD(klp_ops); + +struct klp_ops *klp_find_ops(unsigned long old_addr) +{ + struct klp_ops *ops; + struct klp_func *func; + + list_for_each_entry(ops, &klp_ops, node) { + func = list_first_entry(&ops->func_stack, struct klp_func, + stack_node); + if (func->old_addr == old_addr) + return ops; + } + + return NULL; +} + +static void notrace klp_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *fops, + struct pt_regs *regs) +{ + struct klp_ops *ops; + struct klp_func *func; + int patch_state; + + ops = container_of(fops, struct klp_ops, fops); + + rcu_read_lock(); + + func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, + stack_node); + + /* + * func should never be NULL because preemption should be disabled here + * and unregister_ftrace_function() does the equivalent of a + * synchronize_sched() before the func_stack removal. + */ + if (WARN_ON_ONCE(!func)) + goto unlock; + + /* + * In the enable path, enforce the order of the ops->func_stack and + * func->transition reads. The corresponding write barrier is in + * __klp_enable_patch(). + * + * (Note that this barrier technically isn't needed in the disable + * path. In the rare case where klp_update_patch_state() runs before + * this handler, its TIF_PATCH_PENDING read and this func->transition + * read need to be ordered. But klp_update_patch_state() already + * enforces that.) + */ + smp_rmb(); + + if (unlikely(func->transition)) { + + /* + * Enforce the order of the func->transition and + * current->patch_state reads. Otherwise we could read an + * out-of-date task state and pick the wrong function. The + * corresponding write barrier is in klp_init_transition(). + */ + smp_rmb(); + + patch_state = current->patch_state; + + WARN_ON_ONCE(patch_state == KLP_UNDEFINED); + + if (patch_state == KLP_UNPATCHED) { + /* + * Use the previously patched version of the function. + * If no previous patches exist, continue with the + * original function. + */ + func = list_entry_rcu(func->stack_node.next, + struct klp_func, stack_node); + + if (&func->stack_node == &ops->func_stack) + goto unlock; + } + } + + klp_arch_set_pc(regs, (unsigned long)func->new_func); +unlock: + rcu_read_unlock(); +} + +/* + * Convert a function address into the appropriate ftrace location. + * + * Usually this is just the address of the function, but on some architectures + * it's more complicated so allow them to provide a custom behaviour. + */ +#ifndef klp_get_ftrace_location +static unsigned long klp_get_ftrace_location(unsigned long faddr) +{ + return faddr; +} +#endif + +static void klp_unpatch_func(struct klp_func *func) +{ + struct klp_ops *ops; + + if (WARN_ON(!func->patched)) + return; + if (WARN_ON(!func->old_addr)) + return; + + ops = klp_find_ops(func->old_addr); + if (WARN_ON(!ops)) + return; + + if (list_is_singular(&ops->func_stack)) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (WARN_ON(!ftrace_loc)) + return; + + WARN_ON(unregister_ftrace_function(&ops->fops)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); + + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + } else { + list_del_rcu(&func->stack_node); + } + + func->patched = false; +} + +static int klp_patch_func(struct klp_func *func) +{ + struct klp_ops *ops; + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->patched)) + return -EINVAL; + + ops = klp_find_ops(func->old_addr); + if (!ops) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (!ftrace_loc) { + pr_err("failed to find location for function '%s'\n", + func->old_name); + return -EINVAL; + } + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->fops.func = klp_ftrace_handler; + ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; + + list_add(&ops->node, &klp_ops); + + INIT_LIST_HEAD(&ops->func_stack); + list_add_rcu(&func->stack_node, &ops->func_stack); + + ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + goto err; + } + + ret = register_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); + goto err; + } + + + } else { + list_add_rcu(&func->stack_node, &ops->func_stack); + } + + func->patched = true; + + return 0; + +err: + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + return ret; +} + +void klp_unpatch_object(struct klp_object *obj) +{ + struct klp_func *func; + + klp_for_each_func(obj, func) + if (func->patched) + klp_unpatch_func(func); + + obj->patched = false; +} + +int klp_patch_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (WARN_ON(obj->patched)) + return -EINVAL; + + klp_for_each_func(obj, func) { + ret = klp_patch_func(func); + if (ret) { + klp_unpatch_object(obj); + return ret; + } + } + obj->patched = true; + + return 0; +} + +void klp_unpatch_objects(struct klp_patch *patch) +{ + struct klp_object *obj; + + klp_for_each_object(patch, obj) + if (obj->patched) + klp_unpatch_object(obj); +} diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h new file mode 100644 index 000000000000..0db227170c36 --- /dev/null +++ b/kernel/livepatch/patch.h @@ -0,0 +1,33 @@ +#ifndef _LIVEPATCH_PATCH_H +#define _LIVEPATCH_PATCH_H + +#include <linux/livepatch.h> +#include <linux/list.h> +#include <linux/ftrace.h> + +/** + * struct klp_ops - structure for tracking registered ftrace ops structs + * + * A single ftrace_ops is shared between all enabled replacement functions + * (klp_func structs) which have the same old_addr. This allows the switch + * between function versions to happen instantaneously by updating the klp_ops + * struct's func_stack list. The winner is the klp_func at the top of the + * func_stack (front of the list). + * + * @node: node for the global klp_ops list + * @func_stack: list head for the stack of klp_func's (active func is on top) + * @fops: registered ftrace ops struct + */ +struct klp_ops { + struct list_head node; + struct list_head func_stack; + struct ftrace_ops fops; +}; + +struct klp_ops *klp_find_ops(unsigned long old_addr); + +int klp_patch_object(struct klp_object *obj); +void klp_unpatch_object(struct klp_object *obj); +void klp_unpatch_objects(struct klp_patch *patch); + +#endif /* _LIVEPATCH_PATCH_H */ diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c new file mode 100644 index 000000000000..adc0cc64aa4b --- /dev/null +++ b/kernel/livepatch/transition.c @@ -0,0 +1,553 @@ +/* + * transition.c - Kernel Live Patching transition functions + * + * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/stacktrace.h> +#include "core.h" +#include "patch.h" +#include "transition.h" +#include "../sched/sched.h" + +#define MAX_STACK_ENTRIES 100 +#define STACK_ERR_BUF_SIZE 128 + +struct klp_patch *klp_transition_patch; + +static int klp_target_state = KLP_UNDEFINED; + +/* + * This work can be performed periodically to finish patching or unpatching any + * "straggler" tasks which failed to transition in the first attempt. + */ +static void klp_transition_work_fn(struct work_struct *work) +{ + mutex_lock(&klp_mutex); + + if (klp_transition_patch) + klp_try_complete_transition(); + + mutex_unlock(&klp_mutex); +} +static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); + +/* + * The transition to the target patch state is complete. Clean up the data + * structures. + */ +static void klp_complete_transition(void) +{ + struct klp_object *obj; + struct klp_func *func; + struct task_struct *g, *task; + unsigned int cpu; + bool immediate_func = false; + + if (klp_target_state == KLP_UNPATCHED) { + /* + * All tasks have transitioned to KLP_UNPATCHED so we can now + * remove the new functions from the func_stack. + */ + klp_unpatch_objects(klp_transition_patch); + + /* + * Make sure klp_ftrace_handler() can no longer see functions + * from this patch on the ops->func_stack. Otherwise, after + * func->transition gets cleared, the handler may choose a + * removed function. + */ + synchronize_rcu(); + } + + if (klp_transition_patch->immediate) + goto done; + + klp_for_each_object(klp_transition_patch, obj) { + klp_for_each_func(obj, func) { + func->transition = false; + if (func->immediate) + immediate_func = true; + } + } + + if (klp_target_state == KLP_UNPATCHED && !immediate_func) + module_put(klp_transition_patch->mod); + + /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ + if (klp_target_state == KLP_PATCHED) + synchronize_rcu(); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); + task->patch_state = KLP_UNDEFINED; + } + read_unlock(&tasklist_lock); + + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); + task->patch_state = KLP_UNDEFINED; + } + +done: + klp_target_state = KLP_UNDEFINED; + klp_transition_patch = NULL; +} + +/* + * This is called in the error path, to cancel a transition before it has + * started, i.e. klp_init_transition() has been called but + * klp_start_transition() hasn't. If the transition *has* been started, + * klp_reverse_transition() should be used instead. + */ +void klp_cancel_transition(void) +{ + if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) + return; + + klp_target_state = KLP_UNPATCHED; + klp_complete_transition(); +} + +/* + * Switch the patched state of the task to the set of functions in the target + * patch state. + * + * NOTE: If task is not 'current', the caller must ensure the task is inactive. + * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value. + */ +void klp_update_patch_state(struct task_struct *task) +{ + rcu_read_lock(); + + /* + * This test_and_clear_tsk_thread_flag() call also serves as a read + * barrier (smp_rmb) for two cases: + * + * 1) Enforce the order of the TIF_PATCH_PENDING read and the + * klp_target_state read. The corresponding write barrier is in + * klp_init_transition(). + * + * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read + * of func->transition, if klp_ftrace_handler() is called later on + * the same CPU. See __klp_disable_patch(). + */ + if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) + task->patch_state = READ_ONCE(klp_target_state); + + rcu_read_unlock(); +} + +/* + * Determine whether the given stack trace includes any references to a + * to-be-patched or to-be-unpatched function. + */ +static int klp_check_stack_func(struct klp_func *func, + struct stack_trace *trace) +{ + unsigned long func_addr, func_size, address; + struct klp_ops *ops; + int i; + + if (func->immediate) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + address = trace->entries[i]; + + if (klp_target_state == KLP_UNPATCHED) { + /* + * Check for the to-be-unpatched function + * (the func itself). + */ + func_addr = (unsigned long)func->new_func; + func_size = func->new_size; + } else { + /* + * Check for the to-be-patched function + * (the previous func). + */ + ops = klp_find_ops(func->old_addr); + + if (list_is_singular(&ops->func_stack)) { + /* original function */ + func_addr = func->old_addr; + func_size = func->old_size; + } else { + /* previously patched function */ + struct klp_func *prev; + + prev = list_next_entry(func, stack_node); + func_addr = (unsigned long)prev->new_func; + func_size = prev->new_size; + } + } + + if (address >= func_addr && address < func_addr + func_size) + return -EAGAIN; + } + + return 0; +} + +/* + * Determine whether it's safe to transition the task to the target patch state + * by looking for any to-be-patched or to-be-unpatched functions on its stack. + */ +static int klp_check_stack(struct task_struct *task, char *err_buf) +{ + static unsigned long entries[MAX_STACK_ENTRIES]; + struct stack_trace trace; + struct klp_object *obj; + struct klp_func *func; + int ret; + + trace.skip = 0; + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_ENTRIES; + trace.entries = entries; + ret = save_stack_trace_tsk_reliable(task, &trace); + WARN_ON_ONCE(ret == -ENOSYS); + if (ret) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d has an unreliable stack\n", + __func__, task->comm, task->pid); + return ret; + } + + klp_for_each_object(klp_transition_patch, obj) { + if (!obj->patched) + continue; + klp_for_each_func(obj, func) { + ret = klp_check_stack_func(func, &trace); + if (ret) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is sleeping on function %s\n", + __func__, task->comm, task->pid, + func->old_name); + return ret; + } + } + } + + return 0; +} + +/* + * Try to safely switch a task to the target patch state. If it's currently + * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or + * if the stack is unreliable, return false. + */ +static bool klp_try_switch_task(struct task_struct *task) +{ + struct rq *rq; + struct rq_flags flags; + int ret; + bool success = false; + char err_buf[STACK_ERR_BUF_SIZE]; + + err_buf[0] = '\0'; + + /* check if this task has already switched over */ + if (task->patch_state == klp_target_state) + return true; + + /* + * For arches which don't have reliable stack traces, we have to rely + * on other methods (e.g., switching tasks at kernel exit). + */ + if (!klp_have_reliable_stack()) + return false; + + /* + * Now try to check the stack for any to-be-patched or to-be-unpatched + * functions. If all goes well, switch the task to the target patch + * state. + */ + rq = task_rq_lock(task, &flags); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); + goto done; + } + + ret = klp_check_stack(task, err_buf); + if (ret) + goto done; + + success = true; + + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + task->patch_state = klp_target_state; + +done: + task_rq_unlock(rq, task, &flags); + + /* + * Due to console deadlock issues, pr_debug() can't be used while + * holding the task rq lock. Instead we have to use a temporary buffer + * and print the debug message after releasing the lock. + */ + if (err_buf[0] != '\0') + pr_debug("%s", err_buf); + + return success; + +} + +/* + * Try to switch all remaining tasks to the target patch state by walking the + * stacks of sleeping tasks and looking for any to-be-patched or + * to-be-unpatched functions. If such functions are found, the task can't be + * switched yet. + * + * If any tasks are still stuck in the initial patch state, schedule a retry. + */ +void klp_try_complete_transition(void) +{ + unsigned int cpu; + struct task_struct *g, *task; + bool complete = true; + + WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (klp_transition_patch->immediate) + goto success; + + /* + * Try to switch the tasks to the target patch state by walking their + * stacks and looking for any to-be-patched or to-be-unpatched + * functions. If such functions are found on a stack, or if the stack + * is deemed unreliable, the task can't be switched yet. + * + * Usually this will transition most (or all) of the tasks on a system + * unless the patch includes changes to a very common function. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + if (!klp_try_switch_task(task)) + complete = false; + read_unlock(&tasklist_lock); + + /* + * Ditto for the idle "swapper" tasks. + */ + get_online_cpus(); + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + if (cpu_online(cpu)) { + if (!klp_try_switch_task(task)) + complete = false; + } else if (task->patch_state != klp_target_state) { + /* offline idle tasks can be switched immediately */ + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + task->patch_state = klp_target_state; + } + } + put_online_cpus(); + + if (!complete) { + /* + * Some tasks weren't able to be switched over. Try again + * later and/or wait for other methods like kernel exit + * switching. + */ + schedule_delayed_work(&klp_transition_work, + round_jiffies_relative(HZ)); + return; + } + +success: + pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + + /* we're done, now cleanup the data structures */ + klp_complete_transition(); +} + +/* + * Start the transition to the specified target patch state so tasks can begin + * switching to it. + */ +void klp_start_transition(void) +{ + struct task_struct *g, *task; + unsigned int cpu; + + WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + + pr_notice("'%s': %s...\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (klp_transition_patch->immediate) + return; + + /* + * Mark all normal tasks as needing a patch state update. They'll + * switch either in klp_try_complete_transition() or as they exit the + * kernel. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + if (task->patch_state != klp_target_state) + set_tsk_thread_flag(task, TIF_PATCH_PENDING); + read_unlock(&tasklist_lock); + + /* + * Mark all idle tasks as needing a patch state update. They'll switch + * either in klp_try_complete_transition() or at the idle loop switch + * point. + */ + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + if (task->patch_state != klp_target_state) + set_tsk_thread_flag(task, TIF_PATCH_PENDING); + } +} + +/* + * Initialize the global target patch state and all tasks to the initial patch + * state, and initialize all function transition states to true in preparation + * for patching or unpatching. + */ +void klp_init_transition(struct klp_patch *patch, int state) +{ + struct task_struct *g, *task; + unsigned int cpu; + struct klp_object *obj; + struct klp_func *func; + int initial_state = !state; + + WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED); + + klp_transition_patch = patch; + + /* + * Set the global target patch state which tasks will switch to. This + * has no effect until the TIF_PATCH_PENDING flags get set later. + */ + klp_target_state = state; + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (patch->immediate) + return; + + /* + * Initialize all tasks to the initial patch state to prepare them for + * switching to the target state. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + task->patch_state = initial_state; + } + read_unlock(&tasklist_lock); + + /* + * Ditto for the idle "swapper" tasks. + */ + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + task->patch_state = initial_state; + } + + /* + * Enforce the order of the task->patch_state initializations and the + * func->transition updates to ensure that klp_ftrace_handler() doesn't + * see a func in transition with a task->patch_state of KLP_UNDEFINED. + * + * Also enforce the order of the klp_target_state write and future + * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't + * set a task->patch_state to KLP_UNDEFINED. + */ + smp_wmb(); + + /* + * Set the func transition states so klp_ftrace_handler() will know to + * switch to the transition logic. + * + * When patching, the funcs aren't yet in the func_stack and will be + * made visible to the ftrace handler shortly by the calls to + * klp_patch_object(). + * + * When unpatching, the funcs are already in the func_stack and so are + * already visible to the ftrace handler. + */ + klp_for_each_object(patch, obj) + klp_for_each_func(obj, func) + func->transition = true; +} + +/* + * This function can be called in the middle of an existing transition to + * reverse the direction of the target patch state. This can be done to + * effectively cancel an existing enable or disable operation if there are any + * tasks which are stuck in the initial patch state. + */ +void klp_reverse_transition(void) +{ + unsigned int cpu; + struct task_struct *g, *task; + + klp_transition_patch->enabled = !klp_transition_patch->enabled; + + klp_target_state = !klp_target_state; + + /* + * Clear all TIF_PATCH_PENDING flags to prevent races caused by + * klp_update_patch_state() running in parallel with + * klp_start_transition(). + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + read_unlock(&tasklist_lock); + + for_each_possible_cpu(cpu) + clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); + + /* Let any remaining calls to klp_update_patch_state() complete */ + synchronize_rcu(); + + klp_start_transition(); +} + +/* Called from copy_process() during fork */ +void klp_copy_process(struct task_struct *child) +{ + child->patch_state = current->patch_state; + + /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ +} diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h new file mode 100644 index 000000000000..ce09b326546c --- /dev/null +++ b/kernel/livepatch/transition.h @@ -0,0 +1,14 @@ +#ifndef _LIVEPATCH_TRANSITION_H +#define _LIVEPATCH_TRANSITION_H + +#include <linux/livepatch.h> + +extern struct klp_patch *klp_transition_patch; + +void klp_init_transition(struct klp_patch *patch, int state); +void klp_cancel_transition(void); +void klp_start_transition(void); +void klp_try_complete_transition(void); +void klp_reverse_transition(void); + +#endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6f88e352cd4f..760158d9d98d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o +obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7c38f8f3d97b..c0e31bfee25c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -28,6 +28,9 @@ #define DISABLE_BRANCH_PROFILING #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/proc_fs.h> @@ -658,6 +661,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); @@ -671,10 +675,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) /* * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: + * is the lock object itself. If the lock is in the per cpu area, + * the canonical address of the lock (per cpu offset removed) is + * used. */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; + if (unlikely(!lock->key)) { + unsigned long can_addr, addr = (unsigned long)lock; + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else + return ERR_PTR(-EINVAL); + is_static = true; + } /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -706,7 +723,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } } - return NULL; + return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } /* @@ -724,19 +741,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(class)) + if (likely(!IS_ERR_OR_NULL(class))) goto out_set_class_cache; /* * Debug-check: all keys must be persistent! - */ - if (!static_obj(lock->key)) { + */ + if (IS_ERR(class)) { debug_locks_off(); printk("INFO: trying to register non-static key.\n"); printk("the code is fine but needs lockdep annotation.\n"); printk("turning off the locking correctness validator.\n"); dump_stack(); - return NULL; } @@ -1142,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: possible circular locking dependency detected ]\n"); + pr_warn("======================================================\n"); + pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); - printk("-------------------------------------------------------\n"); + pr_warn("------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); @@ -1480,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + pr_warn("=====================================================\n"); + pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); - printk("------------------------------------------------------\n"); + pr_warn("-----------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, @@ -1709,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, return 0; printk("\n"); - printk("=============================================\n"); - printk("[ INFO: possible recursive locking detected ]\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); - printk("---------------------------------------------\n"); + pr_warn("--------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); @@ -2059,10 +2075,10 @@ static void print_collision(struct task_struct *curr, struct lock_chain *chain) { printk("\n"); - printk("======================\n"); - printk("[chain_key collision ]\n"); + pr_warn("============================\n"); + pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); - printk("----------------------\n"); + pr_warn("----------------------------\n"); printk("%s/%d: ", current->comm, task_pid_nr(current)); printk("Hash chain already cached but the contents don't match!\n"); @@ -2203,7 +2219,7 @@ cache_hit: * Important for check_no_collision(). */ if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { - if (debug_locks_off_graph_unlock()) + if (!debug_locks_off_graph_unlock()) return 0; print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); @@ -2358,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, return 0; printk("\n"); - printk("=================================\n"); - printk("[ INFO: inconsistent lock state ]\n"); + pr_warn("================================\n"); + pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("--------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); @@ -2423,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr, return 0; printk("\n"); - printk("=========================================================\n"); - printk("[ INFO: possible irq lock inversion dependency detected ]\n"); + pr_warn("========================================================\n"); + pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); - printk("---------------------------------------------------------\n"); + pr_warn("--------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); @@ -2861,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; + gfp_mask = current_gfp_context(gfp_mask); + /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; @@ -2870,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -2879,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; + /* Disable lockdep if explicitly requested */ + if (gfp_mask & __GFP_NOLOCKDEP) + return; + mark_held_locks(curr, RECLAIM_FS); } @@ -3168,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; printk("\n"); - printk("==================================\n"); - printk("[ BUG: Nested lock was not taken ]\n"); + pr_warn("==================================\n"); + pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); - printk("----------------------------------\n"); + pr_warn("----------------------------------\n"); printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); @@ -3260,10 +3282,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) + if (hlock->references) { + /* + * Check: unsigned int references:12, overflow. + */ + if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) + return 0; + hlock->references++; - else + } else { hlock->references = 2; + } return 1; } @@ -3374,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: bad unlock balance detected! ]\n"); + pr_warn("=====================================\n"); + pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3410,7 +3439,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (!class) + if (IS_ERR_OR_NULL(class)) return 0; /* @@ -3428,13 +3457,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 0; } +/* @depth must not be zero */ +static struct held_lock *find_held_lock(struct task_struct *curr, + struct lockdep_map *lock, + unsigned int depth, int *idx) +{ + struct held_lock *ret, *hlock, *prev_hlock; + int i; + + i = depth - 1; + hlock = curr->held_locks + i; + ret = hlock; + if (match_held_lock(hlock, lock)) + goto out; + + ret = NULL; + for (i--, prev_hlock = hlock--; + i >= 0; + i--, prev_hlock = hlock--) { + /* + * We must not cross into another context: + */ + if (prev_hlock->irq_context != hlock->irq_context) { + ret = NULL; + break; + } + if (match_held_lock(hlock, lock)) { + ret = hlock; + break; + } + } + +out: + *idx = i; + return ret; +} + +static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, + int idx) +{ + struct held_lock *hlock; + + for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, + hlock->trylock, + hlock->read, hlock->check, + hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip, + hlock->references, hlock->pin_count)) + return 1; + } + return 0; +} + static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class *class; unsigned int depth; int i; @@ -3447,21 +3530,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes + 1; @@ -3469,15 +3541,46 @@ found_it: curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i)) + return 0; + + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + +static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + WARN(hlock->read, "downgrading a read lock"); + hlock->read = 1; + hlock->acquire_ip = ip; + + if (reacquire_held_locks(curr, depth, i)) + return 0; /* * I took it apart and put it back together again, except now I have @@ -3499,7 +3602,7 @@ static int __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; unsigned int depth; int i; @@ -3518,21 +3621,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * Check whether the lock exists in the current stack * of held locks: */ - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -3559,15 +3651,8 @@ found_it: curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (i++; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i + 1)) + return 0; /* * We had N bottles of beer on the wall, we drank one, but now @@ -3732,6 +3817,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +void lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_downgrade(lock, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_downgrade); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -3852,13 +3954,15 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = gfp_mask; + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } +EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); void lockdep_clear_current_reclaim_state(void) { current->lockdep_reclaim_gfp = 0; } +EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); #ifdef CONFIG_LOCK_STAT static int @@ -3871,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=================================\n"); - printk("[ BUG: bad contention detected! ]\n"); + pr_warn("=================================\n"); + pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3894,7 +3998,7 @@ static void __lock_contended(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; int i, contention_point, contending_point; @@ -3907,22 +4011,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, ip); + return; } - print_lock_contention_bug(curr, lock, ip); - return; -found_it: if (hlock->instance != lock) return; @@ -3946,7 +4040,7 @@ static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; u64 now, waittime = 0; @@ -3960,22 +4054,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, _RET_IP_); + return; } - print_lock_contention_bug(curr, lock, _RET_IP_); - return; -found_it: if (hlock->instance != lock) return; @@ -4163,7 +4247,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (class) + if (!IS_ERR_OR_NULL(class)) zap_class(class); } /* @@ -4235,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, return; printk("\n"); - printk("=========================\n"); - printk("[ BUG: held lock freed! ]\n"); + pr_warn("=========================\n"); + pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); - printk("-------------------------\n"); + pr_warn("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); @@ -4293,11 +4377,11 @@ static void print_held_locks_bug(void) return; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", + pr_warn("====================================\n"); + pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); @@ -4362,7 +4446,7 @@ retry: } while_each_thread(g, p); printk("\n"); - printk("=============================================\n\n"); + pr_warn("=============================================\n\n"); if (unlock) read_unlock(&tasklist_lock); @@ -4392,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void) if (!debug_locks_off()) return; printk("\n"); - printk("================================================\n"); - printk("[ BUG: lock held when returning to user space! ]\n"); + pr_warn("================================================\n"); + pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); - printk("------------------------------------------------\n"); + pr_warn("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); @@ -4412,10 +4496,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ printk("\n"); - printk("===============================\n"); - printk("[ INFO: suspicious RCU usage. ]\n"); + pr_warn("=============================\n"); + pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); - printk("-------------------------------\n"); + pr_warn("-----------------------------\n"); printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c2b88490d857..c08fbd2f5ba9 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,13 +46,13 @@ enum { (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) /* - * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, * .data and .bss to fit in required 32MB limit for the kernel. With - * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems. * So, reduce the static allocations for lockdeps related structures so that * everything fits in current required size limit. */ -#ifdef CONFIG_PROVE_LOCKING_SMALL +#ifdef CONFIG_LOCKDEP_SMALL /* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f8c5af52a131..f24582d4dad3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -32,6 +32,8 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <uapi/linux/sched/types.h> +#include <linux/rtmutex.h> #include <linux/atomic.h> #include <linux/moduleparam.h> #include <linux/delay.h> @@ -372,6 +374,78 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#include <linux/ww_mutex.h> +static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); + +static int torture_ww_mutex_lock(void) +__acquires(torture_ww_mutex_0) +__acquires(torture_ww_mutex_1) +__acquires(torture_ww_mutex_2) +{ + LIST_HEAD(list); + struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; + } locks[3], *ll, *ln; + struct ww_acquire_ctx ctx; + + locks[0].lock = &torture_ww_mutex_0; + list_add(&locks[0].link, &list); + + locks[1].lock = &torture_ww_mutex_1; + list_add(&locks[1].link, &list); + + locks[2].lock = &torture_ww_mutex_2; + list_add(&locks[2].link, &list); + + ww_acquire_init(&ctx, &torture_ww_class); + + list_for_each_entry(ll, &list, link) { + int err; + + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &list, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) + return err; + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &list); + } + + ww_acquire_fini(&ctx); + return 0; +} + +static void torture_ww_mutex_unlock(void) +__releases(torture_ww_mutex_0) +__releases(torture_ww_mutex_1) +__releases(torture_ww_mutex_2) +{ + ww_mutex_unlock(&torture_ww_mutex_0); + ww_mutex_unlock(&torture_ww_mutex_1); + ww_mutex_unlock(&torture_ww_mutex_2); +} + +static struct lock_torture_ops ww_mutex_lock_ops = { + .writelock = torture_ww_mutex_lock, + .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_ww_mutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "ww_mutex_lock" +}; + #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); @@ -780,6 +854,10 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + + kfree(cxt.lwsa); + kfree(cxt.lrsa); + end: torture_cleanup_end(); } @@ -793,6 +871,7 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, + &ww_mutex_lock_ops, #ifdef CONFIG_RT_MUTEXES &rtmutex_lock_ops, #endif @@ -924,6 +1003,8 @@ static int __init lock_torture_init(void) GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + kfree(writer_tasks); + writer_tasks = NULL; firsterr = -ENOMEM; goto unwind; } diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index a459faa48987..4174417d5309 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -26,20 +26,3 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); - -#define spin_lock_mutex(lock, flags) \ - do { \ - struct mutex *l = container_of(lock, struct mutex, wait_lock); \ - \ - DEBUG_LOCKS_WARN_ON(in_interrupt()); \ - local_irq_save(flags); \ - arch_spin_lock(&(lock)->rlock.raw_lock);\ - DEBUG_LOCKS_WARN_ON(l->magic != l); \ - } while (0) - -#define spin_unlock_mutex(lock, flags) \ - do { \ - arch_spin_unlock(&(lock)->rlock.raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ - } while (0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9b349619f431..198527a62149 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -19,8 +19,10 @@ */ #include <linux/mutex.h> #include <linux/ww_mutex.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/sched/rt.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/debug.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -50,16 +52,17 @@ EXPORT_SYMBOL(__mutex_init); /* * @owner: contains: 'struct task_struct *' to the current lock owner, * NULL means not owned. Since task_struct pointers are aligned at - * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low - * bits to store extra state. + * at least L1_CACHE_BYTES, we have low bits to store extra state. * * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. * Bit1 indicates unlock needs to hand the lock to the top-waiter + * Bit2 indicates handoff has been done and we're waiting for pickup. */ #define MUTEX_FLAG_WAITERS 0x01 #define MUTEX_FLAG_HANDOFF 0x02 +#define MUTEX_FLAG_PICKUP 0x04 -#define MUTEX_FLAGS 0x03 +#define MUTEX_FLAGS 0x07 static inline struct task_struct *__owner_task(unsigned long owner) { @@ -72,38 +75,29 @@ static inline unsigned long __owner_flags(unsigned long owner) } /* - * Actual trylock that will work on any unlocked state. - * - * When setting the owner field, we must preserve the low flag bits. - * - * Be careful with @handoff, only set that in a wait-loop (where you set - * HANDOFF) to avoid recursive lock attempts. + * Trylock variant that retuns the owning task on failure. */ -static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ unsigned long old, flags = __owner_flags(owner); + unsigned long task = owner & ~MUTEX_FLAGS; - if (__owner_task(owner)) { - if (handoff && unlikely(__owner_task(owner) == current)) { - /* - * Provide ACQUIRE semantics for the lock-handoff. - * - * We cannot easily use load-acquire here, since - * the actual load is a failed cmpxchg, which - * doesn't imply any barriers. - * - * Also, this is a fairly unlikely scenario, and - * this contains the cost. - */ - smp_mb(); /* ACQUIRE */ - return true; - } + if (task) { + if (likely(task != curr)) + break; - return false; + if (likely(!(flags & MUTEX_FLAG_PICKUP))) + break; + + flags &= ~MUTEX_FLAG_PICKUP; + } else { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); +#endif } /* @@ -111,15 +105,24 @@ static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) * past the point where we acquire it. This would be possible * if we (accidentally) set the bit on an unlocked mutex. */ - if (handoff) - flags &= ~MUTEX_FLAG_HANDOFF; + flags &= ~MUTEX_FLAG_HANDOFF; old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); if (old == owner) - return true; + return NULL; owner = old; } + + return __owner_task(owner); +} + +/* + * Actual trylock that will work on any unlocked state. + */ +static inline bool __mutex_trylock(struct mutex *lock) +{ + return !__mutex_trylock_or_owner(lock); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -171,9 +174,9 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait /* * Give up ownership to a specific task, when @task = NULL, this is equivalent - * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE - * semantics like a regular unlock, the __mutex_trylock() provides matching - * ACQUIRE semantics for the handoff. + * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves + * WAITERS. Provides RELEASE semantics like a regular unlock, the + * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff. */ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) { @@ -184,10 +187,13 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; + if (task) + new |= MUTEX_FLAG_PICKUP; old = atomic_long_cmpxchg_release(&lock->owner, owner, new); if (old == owner) @@ -237,8 +243,8 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { #ifdef CONFIG_DEBUG_MUTEXES /* @@ -277,17 +283,50 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, ww_ctx->acquired++; } +static inline bool __sched +__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ + return a->stamp - b->stamp <= LONG_MAX && + (a->stamp != b->stamp || a > b); +} + +/* + * Wake up any waiters that may have to back off when the lock is held by the + * given context. + * + * Due to the invariants on the wait list, this can only affect the first + * waiter with a context. + * + * The current task must not be on the wait list. + */ +static void __sched +__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + + lockdep_assert_held(&lock->wait_lock); + + list_for_each_entry(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (cur->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + + break; + } +} + /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. */ static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - unsigned long flags; - struct mutex_waiter *cur; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; @@ -311,46 +350,79 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, * Uh oh, we raced in fastpath, wake up everyone in this case, * so they can see the new lock->ctx. */ - spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } - spin_unlock_mutex(&lock->base.wait_lock, flags); + spin_lock(&lock->base.wait_lock); + __ww_mutex_wakeup_for_backoff(&lock->base, ctx); + spin_unlock(&lock->base.wait_lock); } /* - * After acquiring lock in the slowpath set ctx and wake up any - * waiters so they can recheck. + * After acquiring lock in the slowpath set ctx. + * + * Unlike for the fast path, the caller ensures that waiters are woken up where + * necessary. * * Callers must hold the mutex wait_lock. */ static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - struct mutex_waiter *cur; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; +} + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + +static inline +bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + struct mutex_waiter *waiter) +{ + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + * + * Check this in every inner iteration because we may + * be racing against another thread's ww_mutex_lock. */ - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } + if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx)) + return false; + + /* + * If we aren't on the wait list yet, cancel the spin + * if there are waiters. We want to avoid stealing the + * lock from a waiter with an earlier stamp, since the + * other thread may already own a lock that we also + * need. + */ + if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS)) + return false; + + /* + * Similarly, stop spinning if we are no longer the + * first waiter. + */ + if (waiter && !__mutex_waiter_is_first(lock, waiter)) + return false; + + return true; } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. + * Look out! "owner" is an entirely speculative pointer access and not + * reliable. + * + * "noinline" so that this function shows up on perf profiles. */ static noinline -bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, + struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) { bool ret = true; @@ -373,6 +445,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) break; } + if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) { + ret = false; + break; + } + cpu_relax(); } rcu_read_unlock(); @@ -431,12 +508,10 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * with the spinner at the head of the OSQ, if present, until the owner is * changed to itself. */ -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, struct mutex_waiter *waiter) { - struct task_struct *task = current; - if (!waiter) { /* * The purpose of the mutex_can_spin_on_owner() function is @@ -460,40 +535,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, for (;;) { struct task_struct *owner; - if (use_ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (READ_ONCE(ww->ctx)) - goto fail_unlock; - } + /* Try to acquire the mutex... */ + owner = __mutex_trylock_or_owner(lock); + if (!owner) + break; /* - * If there's an owner, wait for it to either + * There's an owner, wait for it to either * release the lock or go to sleep. */ - owner = __mutex_owner(lock); - if (owner) { - if (waiter && owner == task) { - smp_mb(); /* ACQUIRE */ - break; - } - - if (!mutex_spin_on_owner(lock, owner)) - goto fail_unlock; - } - - /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock, waiter)) - break; + if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter)) + goto fail_unlock; /* * The cpu_relax() call is a compiler barrier which forces @@ -532,9 +584,9 @@ fail: return false; } #else -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, struct mutex_waiter *waiter) { return false; } @@ -594,23 +646,88 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct mutex_waiter *cur; + + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) + goto deadlock; + + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must back off. + */ + cur = waiter; + list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { + if (cur->ww_ctx) + goto deadlock; + } + + return 0; - if (!hold_ctx) +deadlock: +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; +} + +static inline int __sched +__ww_mutex_add_waiter(struct mutex_waiter *waiter, + struct mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + struct list_head *pos; + + if (!ww_ctx) { + list_add_tail(&waiter->list, &lock->wait_list); return 0; + } - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && - (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. + */ + pos = &lock->wait_list; + list_for_each_entry_reverse(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { + /* Back off immediately if necessary. */ + if (ww_ctx->acquired > 0) { #ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; #endif - return -EDEADLK; + return -EDEADLK; + } + + break; + } + + pos = &cur->list; + + /* + * Wake up the waiter so that it gets a chance to back + * off. + */ + if (cur->ww_ctx->acquired > 0) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } } + list_add_tail(&waiter->list, pos); return 0; } @@ -622,15 +739,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { - struct task_struct *task = current; struct mutex_waiter waiter; - unsigned long flags; bool first = false; struct ww_mutex *ww; int ret; - if (use_ww_ctx) { - ww = container_of(lock, struct ww_mutex, base); + might_sleep(); + + ww = container_of(lock, struct ww_mutex, base); + if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -638,36 +755,54 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (__mutex_trylock(lock, false) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { + if (__mutex_trylock(lock) || + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); preempt_enable(); return 0; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock, false)) + if (__mutex_trylock(lock)) { + if (use_ww_ctx && ww_ctx) + __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + goto skip_wait; + } debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task); + debug_mutex_add_waiter(lock, &waiter, current); + + lock_contended(&lock->dep_map, ip); - /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); - waiter.task = task; + if (!use_ww_ctx) { + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + +#ifdef CONFIG_DEBUG_MUTEXES + waiter.ww_ctx = MUTEX_POISON_WW_CTX; +#endif + } else { + /* Add in stamp order, waking up waiters that must back off. */ + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); + if (ret) + goto err_early_backoff; + + waiter.ww_ctx = ww_ctx; + } + + waiter.task = current; if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - lock_contended(&lock->dep_map, ip); - - set_task_state(task, state); + set_current_state(state); for (;;) { /* * Once we hold wait_lock, we're serialized against @@ -675,7 +810,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * before testing the error conditions to make sure we pick up * the handoff. */ - if (__mutex_trylock(lock, first)) + if (__mutex_trylock(lock)) goto acquired; /* @@ -683,42 +818,47 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ - if (unlikely(signal_pending_state(state, task))) { + if (unlikely(signal_pending_state(state, current))) { ret = -EINTR; goto err; } - if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); + if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { + ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); if (ret) goto err; } - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); - if (!first && __mutex_waiter_is_first(lock, &waiter)) { - first = true; - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + /* + * ww_mutex needs to always recheck its position since its waiter + * list is not FIFO ordered. + */ + if ((use_ww_ctx && ww_ctx) || !first) { + first = __mutex_waiter_is_first(lock, &waiter); + if (first) + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } - set_task_state(task, state); + set_current_state(state); /* * Here we order against unlock; we must either see it change * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || - __mutex_trylock(lock, first)) + if (__mutex_trylock(lock) || + (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) break; - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); acquired: - __set_task_state(task, TASK_RUNNING); + __set_current_state(TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -728,30 +868,44 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); preempt_enable(); return 0; err: - __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); - spin_unlock_mutex(&lock->wait_lock, flags); + __set_current_state(TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current); +err_early_backoff: + spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); preempt_enable(); return ret; } +static int __sched +__mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); +} + +static int __sched +__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -759,30 +913,38 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested); void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); +void __sched +mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL, 0); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { @@ -810,35 +972,37 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); - if (!ret && ctx->acquired > 1) + ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock); +EXPORT_SYMBOL_GPL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); + ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); - if (!ret && ctx->acquired > 1) + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); #endif @@ -848,8 +1012,8 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { struct task_struct *next = NULL; - unsigned long owner, flags; DEFINE_WAKE_Q(wake_q); + unsigned long owner; mutex_release(&lock->dep_map, 1, ip); @@ -866,6 +1030,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif if (owner & MUTEX_FLAG_HANDOFF) @@ -883,7 +1048,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne owner = old; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -900,7 +1065,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); } @@ -950,40 +1115,47 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +void __sched mutex_lock_io(struct mutex *lock) +{ + int token; + + token = io_schedule_prepare(); + mutex_lock(lock); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io); + static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) { - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } #endif @@ -1004,7 +1176,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock, false); + bool locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); @@ -1015,32 +1187,34 @@ EXPORT_SYMBOL(mutex_trylock); #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } return __ww_mutex_lock_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock); +EXPORT_SYMBOL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL(ww_mutex_lock_interruptible); #endif diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4410a4af42a3..6ebc1902f779 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -9,10 +9,6 @@ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define spin_lock_mutex(lock, flags) \ - do { spin_lock(lock); (void)(flags); } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index ce182599cf2e..883cf1b92d90 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,7 +1,6 @@ #include <linux/atomic.h> #include <linux/rwsem.h> #include <linux/percpu.h> -#include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/rcupdate.h> @@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); __init_rwsem(&sem->rw_sem, name, rwsem_key); - init_waitqueue_head(&sem->writer); + rcuwait_init(&sem->writer); sem->readers_block = 0; return 0; } @@ -103,7 +102,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem) __this_cpu_dec(*sem->read_count); /* Prod writer to recheck readers_active */ - wake_up(&sem->writer); + rcuwait_wake_up(&sem->writer); } EXPORT_SYMBOL_GPL(__percpu_up_read); @@ -160,7 +159,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem) */ /* Wait for all now active readers to complete. */ - wait_event(sem->writer, readers_active_check(sem)); + rcuwait_wait_event(&sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e3b5520005db..e6b2f7ad3e51 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -263,7 +263,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running; + return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); } /* diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index e852be4851fc..4a30ef63c607 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -63,6 +63,7 @@ enum qlock_stats { */ #include <linux/debugfs.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/fs.h> static const char * const qstat_names[qstat_num + 1] = { diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 62b6cee8ea7f..58e366ad36f4 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -18,6 +18,7 @@ */ #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/sched/debug.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -101,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! ]\n"); - printk("%s\n", print_tainted()); - printk( "--------------------------------------------\n"); + pr_warn("\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: circular locking deadlock detected!\n"); + pr_warn("%s\n", print_tainted()); + pr_warn("--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), current->comm, task_pid_nr(current)); @@ -173,12 +175,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) lock->name = name; } -void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) -{ -} - -void rt_mutex_deadlock_account_unlock(struct task_struct *task) -{ -} - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index d0519c3432b6..b585af9a1b50 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -9,9 +9,6 @@ * This file contains macros used solely by rtmutex.c. Debug version. */ -extern void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2f443ed2320a..b95509416909 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -12,9 +12,11 @@ */ #include <linux/spinlock.h> #include <linux/export.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/debug.h> #include <linux/timer.h> #include "rtmutex_common.h" @@ -222,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #endif +/* + * Only use with rt_mutex_waiter_{less,equal}() + */ +#define task_to_waiter(p) \ + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + static inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) @@ -236,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return dl_time_before(left->task->dl.deadline, - right->task->dl.deadline); + return dl_time_before(left->deadline, right->deadline); return 0; } +static inline int +rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->prio != right->prio) + return 0; + + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 0 above, + * then right waiter has a dl_prio() too. + */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; + + return 1; +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { @@ -320,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->pi_tree_entry); } -/* - * Calculate task priority from the waiter tree priority - * - * Return task->normal_prio when the waiter tree is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->prio, - task->normal_prio); -} - -struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +static void rt_mutex_adjust_prio(struct task_struct *p) { - if (likely(!task_has_pi_waiters(task))) - return NULL; - - return task_top_pi_waiter(task)->task; -} + struct task_struct *pi_task = NULL; -/* - * Called by sched_setscheduler() to get the priority which will be - * effective after the change. - */ -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) -{ - if (!task_has_pi_waiters(task)) - return newprio; + lockdep_assert_held(&p->pi_lock); - if (task_top_pi_waiter(task)->task->prio <= newprio) - return task_top_pi_waiter(task)->task->prio; - return newprio; -} + if (task_has_pi_waiters(p)) + pi_task = task_top_pi_waiter(p)->task; -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio || dl_prio(prio)) - rt_mutex_setprio(task, prio); -} - -/* - * Adjust task priority (undo boosting). Called from the exit path of - * rt_mutex_slowunlock() and rt_mutex_slowlock(). - * - * (Note: We do this outside of the protection of lock->wait_lock to - * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are - * outside of the debug path.) - */ -void rt_mutex_adjust_prio(struct task_struct *task) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_setprio(p, pi_task); } /* @@ -608,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (waiter->prio == task->prio) { + if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -704,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* [7] Requeue the waiter in the lock waiter tree. */ rt_mutex_dequeue(lock, waiter); + + /* + * Update the waiter prio fields now that we're dequeued. + * + * These values can have changed through either: + * + * sys_sched_set_scheduler() / sys_sched_setattr() + * + * or + * + * DL CBS enforcement advancing the effective deadline. + * + * Even though pi_waiters also uses these fields, and that tree is only + * updated in [11], we can do this here, since we hold [L], which + * serializes all pi_waiters access and rb_erase() does not care about + * the values of the node being removed. + */ waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; + rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ @@ -745,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else if (prerequeue_top_waiter == waiter) { /* @@ -761,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else { /* * Nothing changed. No need to do any priority @@ -831,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { + lockdep_assert_held(&lock->wait_lock); + /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -890,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * the top waiter priority (kernel view), * @task lost. */ - if (task->prio >= rt_mutex_top_waiter(lock)->prio) + if (!rt_mutex_waiter_less(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* @@ -936,8 +928,6 @@ takeit: */ rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, task); - return 1; } @@ -958,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex *next_lock; int chain_walk = 0, res; + lockdep_assert_held(&lock->wait_lock); + /* * Early deadlock detection. We really don't want the task to * enqueue on itself just to untangle the mess later. It's not @@ -971,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -993,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1045,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, waiter = rt_mutex_top_waiter(lock); /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. + * Remove it from current->pi_waiters and deboost. + * + * We must in fact deboost here in order to ensure we call + * rt_mutex_setprio() to update p->pi_top_task before the + * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); + rt_mutex_adjust_prio(current); /* * As we are waking up the top waiter, and the waiter stays @@ -1062,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock(¤t->pi_lock); - + /* + * We deboosted before waking the top waiter task such that we don't + * run two tasks with the 'same' priority (and ensure the + * p->pi_top_task pointer points to a blocked task). This however can + * lead to priority inversion if we would get preempted after the + * deboost but before waking our donor task, hence the preempt_disable() + * before unlock. + * + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); } /* @@ -1080,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock, struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; @@ -1099,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); @@ -1138,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || (waiter->prio == task->prio && - !dl_prio(task->prio))) { + if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -1153,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task) next_lock, NULL, task); } +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; +} + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take @@ -1179,7 +1193,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, * TASK_INTERRUPTIBLE checks for signals and * timeout. Ignored otherwise. */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { + if (likely(state == TASK_INTERRUPTIBLE)) { /* Signal pending? */ if (signal_pending(current)) ret = -EINTR; @@ -1235,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, unsigned long flags; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - RB_CLEAR_NODE(&waiter.pi_tree_entry); - RB_CLEAR_NODE(&waiter.tree_entry); + rt_mutex_init_waiter(&waiter); /* * Technically we could use raw_spin_[un]lock_irq() here, but this can @@ -1328,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path to release a rt-mutex. - * Return whether the current task needs to undo a potential priority boosting. + * + * Return whether the current task needs to call rt_mutex_postunlock(). */ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) @@ -1340,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, debug_rt_mutex_unlock(lock); - rt_mutex_deadlock_account_unlock(current); - /* * We must be careful here if the fast path is enabled. If we * have no waiters queued we cannot set owner to NULL here @@ -1388,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * Queue the next waiter for wakeup once we release the wait_lock. */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - /* check PI boosting */ - return true; + return true; /* call rt_mutex_postunlock() */ } /* @@ -1407,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk)) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } static inline int @@ -1423,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else - return slowfn(lock, state, timeout, chwalk); + + return slowfn(lock, state, timeout, chwalk); } static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 1; - } + return slowfn(lock); } +/* + * Performs the wakeup of the the top-waiter and re-enables preemption. + */ +void rt_mutex_postunlock(struct wake_q_head *wake_q) +{ + wake_up_q(wake_q); + + /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ + preempt_enable(); +} + static inline void rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, @@ -1448,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock, { DEFINE_WAKE_Q(wake_q); - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - - } else { - bool deboost = slowfn(lock, &wake_q); - - wake_up_q(&wake_q); + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; - /* Undo pi boosting if necessary: */ - if (deboost) - rt_mutex_adjust_prio(current); - } + if (slowfn(lock, &wake_q)) + rt_mutex_postunlock(&wake_q); } /** @@ -1493,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /* - * Futex variant with full deadlock detection. + * Futex variant, must not use fastpath. */ -int rt_mutex_timed_futex_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *timeout) +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) { - might_sleep(); - - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_FULL_CHAINWALK, - rt_mutex_slowlock); + return rt_mutex_slowtrylock(lock); } /** @@ -1561,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock - * @lock: the rt_mutex to be unlocked - * - * Returns: true/false indicating whether priority adjustment is - * required or not. + * Futex variant, that since futex variants do not use the fast-path, can be + * simple and will not need to retry. */ -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh) +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wake_q) { - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - return false; + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ } - return rt_mutex_slowunlock(lock, wqh); + + /* + * We've already deboosted, mark_wakeup_next_waiter() will + * retain preempt_disabled when we drop the wait_lock, to + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ + mark_wakeup_next_waiter(wake_q, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +{ + DEFINE_WAKE_Q(wake_q); + bool postunlock; + + raw_spin_lock_irq(&lock->wait_lock); + postunlock = __rt_mutex_futex_unlock(lock, &wake_q); + raw_spin_unlock_irq(&lock->wait_lock); + + if (postunlock) + rt_mutex_postunlock(&wake_q); } /** @@ -1635,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); - rt_mutex_deadlock_account_lock(lock, proxy_owner); } /** @@ -1655,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); - rt_mutex_deadlock_account_unlock(proxy_owner); } -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for FUTEX_REQUEUE_PI support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; - raw_spin_lock_irq(&lock->wait_lock); - - if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock_irq(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; - } /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, @@ -1701,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); - debug_rt_mutex_print_deadlock(waiter); return ret; } /** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** * rt_mutex_next_owner - return the next owner of the lock * * @lock: the rt lock query @@ -1729,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) } /** - * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition * @lock: the rt_mutex we were woken on * @to: the timeout, null if none. hrtimer should already have * been started. * @waiter: the pre-initialized rt_mutex_waiter * - * Complete the lock acquisition started our behalf by another thread. + * Wait for the the lock acquisition started on our behalf by + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call + * rt_mutex_cleanup_proxy_lock(). * * Returns: * 0 - success * <0 - error, one of -EINTR, -ETIMEDOUT * - * Special API call for PI-futex requeue support + * Special API call for PI-futex support */ -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter) { @@ -1756,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - if (unlikely(ret)) + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. Therefore we can + * find we are in fact the owner and must disregard the + * rt_mutex_wait_proxy_lock() failure. + * + * Returns: + * true - did the cleanup, we done. + * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, + * caller should disregards its return value. + * + * Special API call for PI-futex support + */ +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) +{ + bool cleanup = false; + + raw_spin_lock_irq(&lock->wait_lock); + /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { remove_waiter(lock, waiter); + fixup_rt_mutex_waiters(lock); + cleanup = true; + } /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might @@ -1767,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock_irq(&lock->wait_lock); - return ret; + return cleanup; } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index c4060584c407..6607802efa8b 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -11,8 +11,6 @@ */ #define rt_mutex_deadlock_check(l) (0) -#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) -#define rt_mutex_deadlock_account_unlock(l) do { } while (0) #define debug_rt_mutex_init_waiter(w) do { } while (0) #define debug_rt_mutex_free_waiter(w) do { } while (0) #define debug_rt_mutex_lock(l) do { } while (0) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 990134617b4c..72ad45a9a794 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -13,6 +13,7 @@ #define __KERNEL_RTMUTEX_COMMON_H #include <linux/rtmutex.h> +#include <linux/sched/wake_q.h> /* * This is the control structure for tasks blocked on a rt_mutex, @@ -33,6 +34,7 @@ struct rt_mutex_waiter { struct rt_mutex *deadlock_lock; #endif int prio; + u64 deadline; }; /* @@ -102,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); -extern void rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); + +extern int rt_mutex_futex_trylock(struct rt_mutex *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh); + +extern void rt_mutex_postunlock(struct wake_q_head *wake_q); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 1591f6b3539f..c65f7989f850 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -6,7 +6,8 @@ * - Derived also from comments by Linus */ #include <linux/rwsem.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> #include <linux/export.h> enum rwsem_waiter_type { @@ -128,7 +129,6 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) void __sched __down_read(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -140,13 +140,12 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); + get_task_struct(current); list_add_tail(&waiter.list, &sem->wait_list); @@ -158,10 +157,10 @@ void __sched __down_read(struct rw_semaphore *sem) if (!waiter.task) break; schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); out: ; } @@ -194,15 +193,13 @@ int __down_read_trylock(struct rw_semaphore *sem) int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); @@ -216,23 +213,29 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) */ if (sem->count == 0) break; - if (signal_pending_state(state, current)) { - ret = -EINTR; - goto out; - } - set_task_state(tsk, state); + if (signal_pending_state(state, current)) + goto out_nolock; + + set_current_state(state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; -out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; + +out_nolock: + list_del(&waiter.list); + if (!list_empty(&sem->wait_list)) + __rwsem_do_wake(sem, 1); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return -EINTR; } void __sched __down_write(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 631506004f9e..34e727f18e49 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -10,10 +10,12 @@ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. */ #include <linux/rwsem.h> -#include <linux/sched.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/sched/signal.h> #include <linux/sched/rt.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/debug.h> #include <linux/osq_lock.h> #include "rwsem.h" @@ -224,10 +226,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; - struct task_struct *tsk = current; DEFINE_WAKE_Q(wake_q); - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; raw_spin_lock_irq(&sem->wait_lock); @@ -254,13 +255,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!waiter.task) break; schedule(); } - __set_task_state(tsk, TASK_RUNNING); + __set_current_state(TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); @@ -503,8 +504,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - DEFINE_WAKE_Q(wake_q); - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock @@ -514,6 +513,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * for attempting rwsem_try_write_lock(). */ wake_up_q(&wake_q); + + /* + * Reinitialize wake_q after use. + */ + wake_q_init(&wake_q); } } else diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 45ba475d4be3..4d48b1c4870d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -7,6 +7,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> @@ -123,10 +124,8 @@ EXPORT_SYMBOL(up_write); */ void downgrade_write(struct rw_semaphore *sem) { - /* - * lockdep: a downgraded write will live on as a write - * dependency. - */ + lock_downgrade(&sem->dep_map, _RET_IP_); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index b8120abe594b..561acdd39960 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -29,6 +29,7 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> @@ -204,19 +205,18 @@ struct semaphore_waiter { static inline int __sched __down_common(struct semaphore *sem, long state, long timeout) { - struct task_struct *task = current; struct semaphore_waiter waiter; list_add_tail(&waiter.list, &sem->wait_list); - waiter.task = task; + waiter.task = current; waiter.up = false; for (;;) { - if (signal_pending_state(state, task)) + if (signal_pending_state(state, current)) goto interrupted; if (unlikely(timeout <= 0)) goto timed_out; - __set_task_state(task, state); + __set_current_state(state); raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index db3ccb1dd614..4b082b5cac9e 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,14 +363,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); -void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) -{ - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_bh_nested); - unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 0374a596cffa..9aa0fccd5d43 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -103,38 +103,14 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) lock->owner_cpu = -1; } -static void __spin_lock_debug(raw_spinlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); -#ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); -#endif - - /* - * The trylock above was causing a livelock. Give the lower level arch - * specific lock code a chance to acquire the lock. We have already - * printed a warning/backtrace at this point. The non-debug arch - * specific code might actually succeed in acquiring the lock. If it is - * not successful, the end-result is the same - there is no forward - * progress. - */ - arch_spin_lock(&lock->raw_lock); -} - +/* + * We are now relying on the NMI watchdog to detect lockup instead of doing + * the detection here with an unfair lock which can cause problem of its own. + */ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); - if (unlikely(!arch_spin_trylock(&lock->raw_lock))) - __spin_lock_debug(lock); + arch_spin_lock(&lock->raw_lock); debug_spin_lock_after(lock); } @@ -172,32 +148,6 @@ static void rwlock_bug(rwlock_t *lock, const char *msg) #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_read_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - void do_raw_read_lock(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); @@ -247,32 +197,6 @@ static inline void debug_write_unlock(rwlock_t *lock) lock->owner_cpu = -1; } -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_write_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - void do_raw_write_lock(rwlock_t *lock) { debug_write_lock_before(lock); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c new file mode 100644 index 000000000000..39f56c870051 --- /dev/null +++ b/kernel/locking/test-ww_mutex.c @@ -0,0 +1,645 @@ +/* + * Module-based API test facility for ww_mutexes + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + */ + +#include <linux/kernel.h> + +#include <linux/completion.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/ww_mutex.h> + +static DEFINE_WW_CLASS(ww_class); +struct workqueue_struct *wq; + +struct test_mutex { + struct work_struct work; + struct ww_mutex mutex; + struct completion ready, go, done; + unsigned int flags; +}; + +#define TEST_MTX_SPIN BIT(0) +#define TEST_MTX_TRY BIT(1) +#define TEST_MTX_CTX BIT(2) +#define __TEST_MTX_LAST BIT(3) + +static void test_mutex_work(struct work_struct *work) +{ + struct test_mutex *mtx = container_of(work, typeof(*mtx), work); + + complete(&mtx->ready); + wait_for_completion(&mtx->go); + + if (mtx->flags & TEST_MTX_TRY) { + while (!ww_mutex_trylock(&mtx->mutex)) + cond_resched(); + } else { + ww_mutex_lock(&mtx->mutex, NULL); + } + complete(&mtx->done); + ww_mutex_unlock(&mtx->mutex); +} + +static int __test_mutex(unsigned int flags) +{ +#define TIMEOUT (HZ / 16) + struct test_mutex mtx; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mtx.mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); + init_completion(&mtx.ready); + init_completion(&mtx.go); + init_completion(&mtx.done); + mtx.flags = flags; + + schedule_work(&mtx.work); + + wait_for_completion(&mtx.ready); + ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL); + complete(&mtx.go); + if (flags & TEST_MTX_SPIN) { + unsigned long timeout = jiffies + TIMEOUT; + + ret = 0; + do { + if (completion_done(&mtx.done)) { + ret = -EINVAL; + break; + } + cond_resched(); + } while (time_before(jiffies, timeout)); + } else { + ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); + } + ww_mutex_unlock(&mtx.mutex); + ww_acquire_fini(&ctx); + + if (ret) { + pr_err("%s(flags=%x): mutual exclusion failure\n", + __func__, flags); + ret = -EINVAL; + } + + flush_work(&mtx.work); + destroy_work_on_stack(&mtx.work); + return ret; +#undef TIMEOUT +} + +static int test_mutex(void) +{ + int ret; + int i; + + for (i = 0; i < __TEST_MTX_LAST; i++) { + ret = __test_mutex(i); + if (ret) + return ret; + } + + return 0; +} + +static int test_aa(void) +{ + struct ww_mutex mutex; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + ww_mutex_lock(&mutex, &ctx); + + if (ww_mutex_trylock(&mutex)) { + pr_err("%s: trylocked itself!\n", __func__); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = ww_mutex_lock(&mutex, &ctx); + if (ret != -EALREADY) { + pr_err("%s: missed deadlock for recursing, ret=%d\n", + __func__, ret); + if (!ret) + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + ww_mutex_unlock(&mutex); + ww_acquire_fini(&ctx); + return ret; +} + +struct test_abba { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex b_mutex; + struct completion a_ready; + struct completion b_ready; + bool resolve; + int result; +}; + +static void test_abba_work(struct work_struct *work) +{ + struct test_abba *abba = container_of(work, typeof(*abba), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba->b_mutex, &ctx); + + complete(&abba->b_ready); + wait_for_completion(&abba->a_ready); + + err = ww_mutex_lock(&abba->a_mutex, &ctx); + if (abba->resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba->b_mutex); + ww_mutex_lock_slow(&abba->a_mutex, &ctx); + err = ww_mutex_lock(&abba->b_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba->a_mutex); + ww_mutex_unlock(&abba->b_mutex); + ww_acquire_fini(&ctx); + + abba->result = err; +} + +static int test_abba(bool resolve) +{ + struct test_abba abba; + struct ww_acquire_ctx ctx; + int err, ret; + + ww_mutex_init(&abba.a_mutex, &ww_class); + ww_mutex_init(&abba.b_mutex, &ww_class); + INIT_WORK_ONSTACK(&abba.work, test_abba_work); + init_completion(&abba.a_ready); + init_completion(&abba.b_ready); + abba.resolve = resolve; + + schedule_work(&abba.work); + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba.a_mutex, &ctx); + + complete(&abba.a_ready); + wait_for_completion(&abba.b_ready); + + err = ww_mutex_lock(&abba.b_mutex, &ctx); + if (resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba.a_mutex); + ww_mutex_lock_slow(&abba.b_mutex, &ctx); + err = ww_mutex_lock(&abba.a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba.b_mutex); + ww_mutex_unlock(&abba.a_mutex); + ww_acquire_fini(&ctx); + + flush_work(&abba.work); + destroy_work_on_stack(&abba.work); + + ret = 0; + if (resolve) { + if (err || abba.result) { + pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } else { + if (err != -EDEADLK && abba.result != -EDEADLK) { + pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } + return ret; +} + +struct test_cycle { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex *b_mutex; + struct completion *a_signal; + struct completion b_signal; + int result; +}; + +static void test_cycle_work(struct work_struct *work) +{ + struct test_cycle *cycle = container_of(work, typeof(*cycle), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&cycle->a_mutex, &ctx); + + complete(cycle->a_signal); + wait_for_completion(&cycle->b_signal); + + err = ww_mutex_lock(cycle->b_mutex, &ctx); + if (err == -EDEADLK) { + ww_mutex_unlock(&cycle->a_mutex); + ww_mutex_lock_slow(cycle->b_mutex, &ctx); + err = ww_mutex_lock(&cycle->a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(cycle->b_mutex); + ww_mutex_unlock(&cycle->a_mutex); + ww_acquire_fini(&ctx); + + cycle->result = err; +} + +static int __test_cycle(unsigned int nthreads) +{ + struct test_cycle *cycles; + unsigned int n, last = nthreads - 1; + int ret; + + cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL); + if (!cycles) + return -ENOMEM; + + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + ww_mutex_init(&cycle->a_mutex, &ww_class); + if (n == last) + cycle->b_mutex = &cycles[0].a_mutex; + else + cycle->b_mutex = &cycles[n + 1].a_mutex; + + if (n == 0) + cycle->a_signal = &cycles[last].b_signal; + else + cycle->a_signal = &cycles[n - 1].b_signal; + init_completion(&cycle->b_signal); + + INIT_WORK(&cycle->work, test_cycle_work); + cycle->result = 0; + } + + for (n = 0; n < nthreads; n++) + queue_work(wq, &cycles[n].work); + + flush_workqueue(wq); + + ret = 0; + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + if (!cycle->result) + continue; + + pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + n, nthreads, cycle->result); + ret = -EINVAL; + break; + } + + for (n = 0; n < nthreads; n++) + ww_mutex_destroy(&cycles[n].a_mutex); + kfree(cycles); + return ret; +} + +static int test_cycle(unsigned int ncpus) +{ + unsigned int n; + int ret; + + for (n = 2; n <= ncpus + 1; n++) { + ret = __test_cycle(n); + if (ret) + return ret; + } + + return 0; +} + +struct stress { + struct work_struct work; + struct ww_mutex *locks; + unsigned long timeout; + int nlocks; +}; + +static int *get_random_order(int count) +{ + int *order; + int n, r, tmp; + + order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + if (!order) + return order; + + for (n = 0; n < count; n++) + order[n] = n; + + for (n = count - 1; n > 1; n--) { + r = get_random_int() % (n + 1); + if (r != n) { + tmp = order[n]; + order[n] = order[r]; + order[r] = tmp; + } + } + + return order; +} + +static void dummy_load(struct stress *stress) +{ + usleep_range(1000, 2000); +} + +static void stress_inorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *locks = stress->locks; + struct ww_acquire_ctx ctx; + int *order; + + order = get_random_order(nlocks); + if (!order) + return; + + do { + int contended = -1; + int n, err; + + ww_acquire_init(&ctx, &ww_class); +retry: + err = 0; + for (n = 0; n < nlocks; n++) { + if (n == contended) + continue; + + err = ww_mutex_lock(&locks[order[n]], &ctx); + if (err < 0) + break; + } + if (!err) + dummy_load(stress); + + if (contended > n) + ww_mutex_unlock(&locks[order[contended]]); + contended = n; + while (n--) + ww_mutex_unlock(&locks[order[n]]); + + if (err == -EDEADLK) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } + + if (err) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); + + kfree(order); + kfree(stress); +} + +struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; +}; + +static void stress_reorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + LIST_HEAD(locks); + struct ww_acquire_ctx ctx; + struct reorder_lock *ll, *ln; + int *order; + int n, err; + + order = get_random_order(stress->nlocks); + if (!order) + return; + + for (n = 0; n < stress->nlocks; n++) { + ll = kmalloc(sizeof(*ll), GFP_KERNEL); + if (!ll) + goto out; + + ll->lock = &stress->locks[order[n]]; + list_add(&ll->link, &locks); + } + kfree(order); + order = NULL; + + do { + ww_acquire_init(&ctx, &ww_class); + + list_for_each_entry(ll, &locks, link) { + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &locks, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &locks); /* restarts iteration */ + } + + dummy_load(stress); + list_for_each_entry(ll, &locks, link) + ww_mutex_unlock(ll->lock); + + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); + +out: + list_for_each_entry_safe(ll, ln, &locks, link) + kfree(ll); + kfree(order); + kfree(stress); +} + +static void stress_one_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + int err; + + do { + err = ww_mutex_lock(lock, NULL); + if (!err) { + dummy_load(stress); + ww_mutex_unlock(lock); + } else { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + } while (!time_after(jiffies, stress->timeout)); + + kfree(stress); +} + +#define STRESS_INORDER BIT(0) +#define STRESS_REORDER BIT(1) +#define STRESS_ONE BIT(2) +#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) + +static int stress(int nlocks, int nthreads, unsigned int flags) +{ + struct ww_mutex *locks; + int n; + + locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); + if (!locks) + return -ENOMEM; + + for (n = 0; n < nlocks; n++) + ww_mutex_init(&locks[n], &ww_class); + + for (n = 0; nthreads; n++) { + struct stress *stress; + void (*fn)(struct work_struct *work); + + fn = NULL; + switch (n & 3) { + case 0: + if (flags & STRESS_INORDER) + fn = stress_inorder_work; + break; + case 1: + if (flags & STRESS_REORDER) + fn = stress_reorder_work; + break; + case 2: + if (flags & STRESS_ONE) + fn = stress_one_work; + break; + } + + if (!fn) + continue; + + stress = kmalloc(sizeof(*stress), GFP_KERNEL); + if (!stress) + break; + + INIT_WORK(&stress->work, fn); + stress->locks = locks; + stress->nlocks = nlocks; + stress->timeout = jiffies + 2*HZ; + + queue_work(wq, &stress->work); + nthreads--; + } + + flush_workqueue(wq); + + for (n = 0; n < nlocks; n++) + ww_mutex_destroy(&locks[n]); + kfree(locks); + + return 0; +} + +static int __init test_ww_mutex_init(void) +{ + int ncpus = num_online_cpus(); + int ret; + + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; + + ret = test_mutex(); + if (ret) + return ret; + + ret = test_aa(); + if (ret) + return ret; + + ret = test_abba(false); + if (ret) + return ret; + + ret = test_abba(true); + if (ret) + return ret; + + ret = test_cycle(ncpus); + if (ret) + return ret; + + ret = stress(16, 2*ncpus, STRESS_INORDER); + if (ret) + return ret; + + ret = stress(16, 2*ncpus, STRESS_REORDER); + if (ret) + return ret; + + ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + if (ret) + return ret; + + return 0; +} + +static void __exit test_ww_mutex_exit(void) +{ + destroy_workqueue(wq); +} + +module_init(test_ww_mutex_init); +module_exit(test_ww_mutex_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/kernel/membarrier.c b/kernel/membarrier.c index 536c727a56e9..9f9284f37f8d 100644 --- a/kernel/membarrier.c +++ b/kernel/membarrier.c @@ -16,6 +16,7 @@ #include <linux/syscalls.h> #include <linux/membarrier.h> +#include <linux/tick.h> /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, @@ -51,6 +52,9 @@ */ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) { + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -ENOSYS; if (unlikely(flags)) return -EINVAL; switch (cmd) { diff --git a/kernel/memremap.c b/kernel/memremap.c index 9ecedc28b928..23a6483c3666 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -182,18 +182,6 @@ struct page_map { struct vmem_altmap altmap; }; -void get_zone_device_page(struct page *page) -{ - percpu_ref_get(page->pgmap->ref); -} -EXPORT_SYMBOL(get_zone_device_page); - -void put_zone_device_page(struct page *page) -{ - put_dev_pagemap(page->pgmap); -} -EXPORT_SYMBOL(put_zone_device_page); - static void pgmap_radix_release(struct resource *res) { resource_size_t key, align_start, align_size, align_end; @@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data) struct resource *res = &page_map->res; resource_size_t align_start, align_size; struct dev_pagemap *pgmap = &page_map->pgmap; + unsigned long pfn; + + for_each_device_pfn(pfn, page_map) + put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { dev_WARN(dev, "%s: page mapping is still live!\n", __func__); @@ -246,9 +238,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); + mem_hotplug_begin(); arch_remove_memory(align_start, align_size); mem_hotplug_done(); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, @@ -275,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) * * Notes: * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). + * (or devm release event). The expected order of events is that @ref has + * been through percpu_ref_kill() before devm_memremap_pages_release(). The + * wait for the completion of all references being dropped and + * percpu_ref_exit() must occur after devm_memremap_pages_release(). * * 2/ @res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -377,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; + percpu_ref_get(ref); } devres_add(dev, page_map); return __va(res->start); diff --git a/kernel/module.c b/kernel/module.c index 3d8f126208e3..4a3665f8f837 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -17,6 +17,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/export.h> +#include <linux/extable.h> #include <linux/moduleloader.h> #include <linux/trace_events.h> #include <linux/init.h> @@ -48,6 +49,9 @@ #include <linux/rculist.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> +#ifdef CONFIG_STRICT_MODULE_RWX +#include <asm/set_memory.h> +#endif #include <asm/mmu_context.h> #include <linux/license.h> #include <asm/sections.h> @@ -61,6 +65,7 @@ #include <linux/pfn.h> #include <linux/bsearch.h> #include <linux/dynamic_debug.h> +#include <linux/audit.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -74,9 +79,9 @@ /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but - * only when CONFIG_DEBUG_SET_MODULE_RONX=y + * only when CONFIG_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) #else # define debug_align(X) (X) @@ -663,16 +668,7 @@ static void percpu_modcopy(struct module *mod, memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); } -/** - * is_module_percpu_address - test whether address is from module static percpu - * @addr: address to test - * - * Test whether @addr belongs to module static percpu area. - * - * RETURNS: - * %true if @addr is from module static percpu area - */ -bool is_module_percpu_address(unsigned long addr) +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { struct module *mod; unsigned int cpu; @@ -686,9 +682,15 @@ bool is_module_percpu_address(unsigned long addr) continue; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(mod->percpu, cpu); - - if ((void *)addr >= start && - (void *)addr < start + mod->percpu_size) { + void *va = (void *)addr; + + if (va >= start && va < start + mod->percpu_size) { + if (can_addr) { + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(mod->percpu, + get_boot_cpu_id()); + } preempt_enable(); return true; } @@ -699,6 +701,20 @@ bool is_module_percpu_address(unsigned long addr) return false; } +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + return __is_module_percpu_address(addr, NULL); +} + #else /* ... !CONFIG_SMP */ static inline void __percpu *mod_percpu(struct module *mod) @@ -730,6 +746,11 @@ bool is_module_percpu_address(unsigned long addr) return false; } +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ @@ -945,6 +966,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; + audit_log_kern_module(name); + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -1844,7 +1867,7 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -2809,6 +2832,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) if (get_modinfo(info, "livepatch")) { mod->klp = true; add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", + mod->name); } return 0; @@ -2842,7 +2867,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, /* Suck in entire file: we'll want most of it. */ info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); + GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); if (!info->hdr) return -ENOMEM; @@ -3608,6 +3633,8 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + audit_log_kern_module(mod->name); + /* Reserve our place in the list. */ err = add_unformed_module(mod); if (err) @@ -3696,7 +3723,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->name, after_dashes); } - /* Link in to syfs. */ + /* Link in to sysfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto coming_cleanup; @@ -3719,6 +3746,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_sysfs_teardown(mod); coming_cleanup: mod->state = MODULE_STATE_GOING; + destroy_params(mod->kp, mod->num_kp); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); @@ -4010,7 +4038,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) /* Don't lock: we're in enough trouble already. */ preempt_disable(); - if ((colon = strchr(name, ':')) != NULL) { + if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { if ((mod = find_module_all(name, colon - name, false)) != NULL) ret = mod_find_symname(mod, colon+1); } else { @@ -4165,22 +4193,23 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) struct module *mod; preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (mod->num_exentries == 0) - continue; + mod = __module_address(addr); + if (!mod) + goto out; - e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, - addr); - if (e) - break; - } + if (!mod->num_exentries) + goto out; + + e = search_extable(mod->extable, + mod->extable + mod->num_exentries - 1, + addr); +out: preempt_enable(); - /* Now, if we found one, we are running inside it now, hence - we cannot unload the module, hence no refcnt needed. */ + /* + * Now, if we found one, we are running inside it now, hence + * we cannot unload the module, hence no refcnt needed. + */ return e; } diff --git a/kernel/notifier.c b/kernel/notifier.c index fd2c9acbcc19..6196af8a8223 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -95,7 +95,7 @@ static int notifier_call_chain(struct notifier_block **nl, if (nr_calls) (*nr_calls)++; - if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) + if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 782102e59eed..f6c5d330059a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include <linux/file.h> #include <linux/syscalls.h> #include <linux/cgroup.h> +#include <linux/perf_event.h> static struct kmem_cache *nsproxy_cachep; @@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) goto out; } switch_task_namespaces(tsk, new_nsproxy); + + perf_event_namespaces(tsk); out: fput(file); return err; diff --git a/kernel/padata.c b/kernel/padata.c index 05316c9f32da..ac8f1e524836 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -154,8 +154,6 @@ EXPORT_SYMBOL(padata_do_parallel); * A pointer to the control struct of the next object that needs * serialization, if present in one of the percpu reorder queues. * - * NULL, if all percpu reorder queues are empty. - * * -EINPROGRESS, if the next object that needs serialization will * be parallel processed by another cpu and is not yet present in * the cpu's reorder queue. @@ -182,23 +180,22 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - padata = NULL; - reorder = &next_queue->reorder; + spin_lock(&reorder->lock); if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); - spin_unlock(&reorder->lock); pd->processed++; + spin_unlock(&reorder->lock); goto out; } + spin_unlock(&reorder->lock); if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); @@ -234,12 +231,11 @@ static void padata_reorder(struct parallel_data *pd) padata = padata_get_next(pd); /* - * All reorder queues are empty, or the next object that needs - * serialization is parallel processed by another cpu and is - * still on it's way to the cpu's reorder queue, nothing to - * do for now. + * If the next object that needs serialization is parallel + * processed by another cpu and is still on it's way to the + * cpu's reorder queue, nothing to do for now. */ - if (!padata || PTR_ERR(padata) == -EINPROGRESS) + if (PTR_ERR(padata) == -EINPROGRESS) break; /* @@ -353,7 +349,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd, cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pd->cpumask.cbcpu); + free_cpumask_var(pd->cpumask.pcpu); return -ENOMEM; } diff --git a/kernel/panic.c b/kernel/panic.c index 08aa88dde7de..a58932b41700 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -9,6 +9,7 @@ * to indicate a major problem. */ #include <linux/debug_locks.h> +#include <linux/sched/debug.h> #include <linux/interrupt.h> #include <linux/kmsg_dump.h> #include <linux/kallsyms.h> @@ -188,7 +189,7 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) { - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(NULL); /* @@ -213,7 +214,7 @@ void panic(const char *fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); /* Call flush even twice. It tries harder with a single online CPU */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* @@ -273,7 +274,8 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n" + "twice on console to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) diff --git a/kernel/params.c b/kernel/params.c index a6d6149c0fe6..60b2d8101355 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -160,58 +160,6 @@ static int parse_one(char *param, return -ENOENT; } -/* You can use " around spaces, but can't escape ". */ -/* Hyphens and underscores equivalent in parameter names. */ -static char *next_arg(char *args, char **param, char **val) -{ - unsigned int i, equals = 0; - int in_quote = 0, quoted = 0; - char *next; - - if (*args == '"') { - args++; - in_quote = 1; - quoted = 1; - } - - for (i = 0; args[i]; i++) { - if (isspace(args[i]) && !in_quote) - break; - if (equals == 0) { - if (args[i] == '=') - equals = i; - } - if (args[i] == '"') - in_quote = !in_quote; - } - - *param = args; - if (!equals) - *val = NULL; - else { - args[equals] = '\0'; - *val = args + equals + 1; - - /* Don't include quotes in value. */ - if (**val == '"') { - (*val)++; - if (args[i-1] == '"') - args[i-1] = '\0'; - } - } - if (quoted && args[i-1] == '"') - args[i-1] = '\0'; - - if (args[i]) { - args[i] = '\0'; - next = args + i + 1; - } else - next = args + i; - - /* Chew up trailing spaces. */ - return skip_spaces(next); -} - /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ char *parse_args(const char *doing, char *args, diff --git a/kernel/pid.c b/kernel/pid.c index f66162f2359b..fd1cde1e4576 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -38,6 +38,7 @@ #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/proc_fs.h> +#include <linux/sched/task.h> #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -68,9 +69,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, + .kref = KREF_INIT(2), .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, @@ -322,8 +321,10 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) + if (pid_ns_prepare_proc(ns)) { + disable_pid_allocation(ns); goto out_free; + } } get_pid_ns(ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index eef2ce968636..d1f3e9f558b8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -12,12 +12,15 @@ #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/syscalls.h> +#include <linux/cred.h> #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> struct pid_cache { int nr_ids; @@ -371,6 +374,29 @@ static struct ns_common *pidns_get(struct task_struct *task) return ns ? &ns->ns : NULL; } +static struct ns_common *pidns_for_children_get(struct task_struct *task) +{ + struct pid_namespace *ns = NULL; + + task_lock(task); + if (task->nsproxy) { + ns = task->nsproxy->pid_ns_for_children; + get_pid_ns(ns); + } + task_unlock(task); + + if (ns) { + read_lock(&tasklist_lock); + if (!ns->child_reaper) { + put_pid_ns(ns); + ns = NULL; + } + read_unlock(&tasklist_lock); + } + + return ns ? &ns->ns : NULL; +} + static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); @@ -440,6 +466,17 @@ const struct proc_ns_operations pidns_operations = { .get_parent = pidns_get_parent, }; +const struct proc_ns_operations pidns_for_children_operations = { + .name = "pid_for_children", + .real_ns_name = "pid", + .type = CLONE_NEWPID, + .get = pidns_for_children_get, + .put = pidns_put, + .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26dbc48c75b..a8b978c35a6a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -10,6 +10,8 @@ * This file is released under the GPLv2. */ +#define pr_fmt(fmt) "PM: " fmt + #include <linux/export.h> #include <linux/suspend.h> #include <linux/syscalls.h> @@ -21,6 +23,7 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/pm.h> +#include <linux/nmi.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> @@ -104,7 +107,7 @@ EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG static void hibernation_debug_sleep(void) { - printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); + pr_info("hibernation debug: Waiting for 5 seconds.\n"); mdelay(5000); } @@ -250,10 +253,9 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); + pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", + msg, k, centisecs / 100, centisecs % 100, kps / 1000, + (kps % 1000) / 10); } /** @@ -271,8 +273,7 @@ static int create_image(int platform_mode) error = dpm_suspend_end(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting hibernation\n"); + pr_err("Some devices failed to power down, aborting hibernation\n"); return error; } @@ -288,8 +289,7 @@ static int create_image(int platform_mode) error = syscore_suspend(); if (error) { - printk(KERN_ERR "PM: Some system devices failed to power down, " - "aborting hibernation\n"); + pr_err("Some system devices failed to power down, aborting hibernation\n"); goto Enable_irqs; } @@ -304,8 +304,8 @@ static int create_image(int platform_mode) restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) - printk(KERN_ERR "PM: Error %d creating hibernation image\n", - error); + pr_err("Error %d creating hibernation image\n", error); + if (!in_suspend) { events_check_enabled = false; clear_free_pages(); @@ -432,8 +432,7 @@ static int resume_target_kernel(bool platform_mode) error = dpm_suspend_end(PMSG_QUIESCE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting resume\n"); + pr_err("Some devices failed to power down, aborting resume\n"); return error; } @@ -608,6 +607,22 @@ static void power_down(void) { #ifdef CONFIG_SUSPEND int error; + + if (hibernation_mode == HIBERNATION_SUSPEND) { + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + hibernation_mode = hibernation_ops ? + HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; + } else { + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("Swap will be unusable! Try swapon -a.\n"); + + return; + } + } #endif switch (hibernation_mode) { @@ -620,32 +635,13 @@ static void power_down(void) if (pm_power_off) kernel_power_off(); break; -#ifdef CONFIG_SUSPEND - case HIBERNATION_SUSPEND: - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - if (error) { - if (hibernation_ops) - hibernation_mode = HIBERNATION_PLATFORM; - else - hibernation_mode = HIBERNATION_SHUTDOWN; - power_down(); - } - /* - * Restore swap signature. - */ - error = swsusp_unmark(); - if (error) - printk(KERN_ERR "PM: Swap will be unusable! " - "Try swapon -a.\n"); - return; -#endif } kernel_halt(); /* * Valid image is on the disk, if we continue we risk serious data * corruption after resume. */ - printk(KERN_CRIT "PM: Please power down manually\n"); + pr_crit("Power down manually\n"); while (1) cpu_relax(); } @@ -655,7 +651,7 @@ static int load_image_and_restore(void) int error; unsigned int flags; - pr_debug("PM: Loading hibernation image.\n"); + pr_debug("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); @@ -667,7 +663,7 @@ static int load_image_and_restore(void) if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); - printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + pr_err("Failed to load hibernation image, recovering.\n"); swsusp_free(); free_basic_memory_bitmaps(); Unlock: @@ -685,7 +681,7 @@ int hibernate(void) bool snapshot_test = false; if (!hibernation_available()) { - pr_debug("PM: Hibernation not available.\n"); + pr_debug("Hibernation not available.\n"); return -EPERM; } @@ -703,9 +699,9 @@ int hibernate(void) goto Exit; } - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("Syncing filesystems ... \n"); sys_sync(); - printk("done.\n"); + pr_info("done.\n"); error = freeze_processes(); if (error) @@ -731,7 +727,7 @@ int hibernate(void) else flags |= SF_CRC32_MODE; - pr_debug("PM: writing image.\n"); + pr_debug("Writing image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { @@ -743,7 +739,7 @@ int hibernate(void) in_suspend = 0; pm_restore_gfp_mask(); } else { - pr_debug("PM: Image restored successfully.\n"); + pr_debug("Image restored successfully.\n"); } Free_bitmaps: @@ -751,7 +747,7 @@ int hibernate(void) Thaw: unlock_device_hotplug(); if (snapshot_test) { - pr_debug("PM: Checking hibernation image\n"); + pr_debug("Checking hibernation image\n"); error = swsusp_check(); if (!error) error = load_image_and_restore(); @@ -815,10 +811,10 @@ static int software_resume(void) goto Unlock; } - pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + pr_debug("Checking hibernation image partition %s\n", resume_file); if (resume_delay) { - printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + pr_info("Waiting %dsec before reading resume device ...\n", resume_delay); ssleep(resume_delay); } @@ -857,10 +853,10 @@ static int software_resume(void) } Check_image: - pr_debug("PM: Hibernation image partition %d:%d present\n", + pr_debug("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("PM: Looking for hibernation image.\n"); + pr_debug("Looking for hibernation image.\n"); error = swsusp_check(); if (error) goto Unlock; @@ -879,7 +875,7 @@ static int software_resume(void) goto Close_Finish; } - pr_debug("PM: Preparing processes for restore.\n"); + pr_debug("Preparing processes for restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; @@ -892,7 +888,7 @@ static int software_resume(void) /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); - pr_debug("PM: Hibernation image not present or could not be loaded.\n"); + pr_debug("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: swsusp_close(FMODE_READ); @@ -1016,7 +1012,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, error = -EINVAL; if (!error) - pr_debug("PM: Hibernation mode set to '%s'\n", + pr_debug("Hibernation mode set to '%s'\n", hibernation_modes[mode]); unlock_system_sleep(); return error ? error : n; @@ -1052,7 +1048,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, lock_system_sleep(); swsusp_resume_device = res; unlock_system_sleep(); - printk(KERN_INFO "PM: Starting manual resume from disk\n"); + pr_info("Starting manual resume from disk\n"); noresume = 0; software_resume(); return n; @@ -1156,7 +1152,7 @@ static int __init hibernate_setup(char *str) } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; - } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) + } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && !strncmp(str, "protect_image", 13)) { enable_restore_image_protection(); } diff --git a/kernel/power/power.h b/kernel/power/power.h index 1dfa0da827d3..7fdc40d31b7d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -61,12 +61,12 @@ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX /* kernel/power/snapshot.c */ extern void enable_restore_image_protection(void); #else static inline void enable_restore_image_protection(void) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ #else /* !CONFIG_HIBERNATION */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 2fba066e125f..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -12,6 +12,8 @@ #include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> @@ -130,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(); + pm_wakeup_clear(true); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2d8e2b227db8..3b1e0f3ad07f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -22,6 +22,7 @@ #include <linux/device.h> #include <linux/init.h> #include <linux/bootmem.h> +#include <linux/nmi.h> #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> @@ -35,10 +36,13 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/io.h> +#ifdef CONFIG_STRICT_KERNEL_RWX +#include <asm/set_memory.h> +#endif #include "power.h" -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX static bool hibernate_restore_protection; static bool hibernate_restore_protection_active; @@ -73,7 +77,7 @@ static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..c0248c74d6d4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,6 +72,8 @@ static void freeze_begin(void) static void freeze_enter(void) { + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -98,6 +100,27 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); +} + +static void s2idle_loop(void) +{ + do { + freeze_enter(); + + if (freeze_ops && freeze_ops->wake) + freeze_ops->wake(); + + dpm_resume_noirq(PMSG_RESUME); + if (freeze_ops && freeze_ops->sync) + freeze_ops->sync(); + + if (pm_wakeup_pending()) + break; + + pm_wakeup_clear(false); + } while (!dpm_suspend_noirq(PMSG_SUSPEND)); } void freeze_wake(void) @@ -371,10 +394,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { - trace_suspend_resume(TPS("machine_suspend"), state, true); - freeze_enter(); - trace_suspend_resume(TPS("machine_suspend"), state, false); - goto Platform_wake; + s2idle_loop(); + goto Platform_early_resume; } error = disable_nonboot_cpus(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index bdff5ed57f10..5db217051232 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -166,7 +166,7 @@ static int __init setup_test_suspend(char *value) return 0; } - for (i = 0; pm_labels[i]; i++) + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 32e0c232efba..f80fd33639e0 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -201,7 +201,7 @@ void free_all_swap_pages(int swap) struct swsusp_extent *ext; unsigned long offset; - ext = container_of(node, struct swsusp_extent, node); + ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) swap_free(swp_entry(swap, offset)); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index abb0042a427b..4a2ffc39eb95 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,3 @@ obj-y = printk.o -obj-$(CONFIG_PRINTK_NMI) += nmi.o +obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index d5760c42f042..61d41ca41844 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -2,12 +2,13 @@ #include <linux/kernel.h> #include <linux/console.h> +#include <linux/errno.h> #include <linux/string.h> #include "console_cmdline.h" #include "braille.h" -char *_braille_console_setup(char **str, char **brl_options) +int _braille_console_setup(char **str, char **brl_options) { if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; @@ -15,14 +16,14 @@ char *_braille_console_setup(char **str, char **brl_options) } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); - if (!*str) + if (!*str) { pr_err("need port name after brl=\n"); - else - *((*str)++) = 0; - } else - return NULL; + return -EINVAL; + } + *((*str)++) = 0; + } - return *str; + return 0; } int diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h index 769d771145c8..749a6756843a 100644 --- a/kernel/printk/braille.h +++ b/kernel/printk/braille.h @@ -9,7 +9,14 @@ braille_set_options(struct console_cmdline *c, char *brl_options) c->brl_options = brl_options; } -char * +/* + * Setup console according to braille options. + * Return -EINVAL on syntax error, 0 on success (or no braille option was + * actually given). + * Modifies str to point to the serial options + * Sets brl_options to the parsed braille options. + */ +int _braille_console_setup(char **str, char **brl_options); int @@ -25,10 +32,10 @@ braille_set_options(struct console_cmdline *c, char *brl_options) { } -static inline char * +static inline int _braille_console_setup(char **str, char **brl_options) { - return NULL; + return 0; } static inline int diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 7fd2838fa417..1db044f808b7 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,42 +16,55 @@ */ #include <linux/percpu.h> -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK -int __printf(1, 0) vprintk_default(const char *fmt, va_list args); - -#ifdef CONFIG_PRINTK_NMI +#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff +#define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; +__printf(1, 0) int vprintk_default(const char *fmt, va_list args); +__printf(1, 0) int vprintk_func(const char *fmt, va_list args); +void __printk_safe_enter(void); +void __printk_safe_exit(void); + +#define printk_safe_enter_irqsave(flags) \ + do { \ + local_irq_save(flags); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irqrestore(flags) \ + do { \ + __printk_safe_exit(); \ + local_irq_restore(flags); \ + } while (0) + +#define printk_safe_enter_irq() \ + do { \ + local_irq_disable(); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irq() \ + do { \ + __printk_safe_exit(); \ + local_irq_enable(); \ + } while (0) + +#else + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } + /* - * printk() could not take logbuf_lock in NMI context. Instead, - * it temporary stores the strings into a per-CPU buffer. - * The alternative implementation is chosen transparently - * via per-CPU variable. + * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem + * semaphore and some of console functions (console_unlock()/etc.), so + * printk-safe must preserve the existing local IRQ guarantees. */ -DECLARE_PER_CPU(printk_func_t, printk_func); -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return this_cpu_read(printk_func)(fmt, args); -} - -extern atomic_t nmi_message_lost; -static inline int get_nmi_message_lost(void) -{ - return atomic_xchg(&nmi_message_lost, 0); -} - -#else /* CONFIG_PRINTK_NMI */ - -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return vprintk_default(fmt, args); -} - -static inline int get_nmi_message_lost(void) -{ - return 0; -} - -#endif /* CONFIG_PRINTK_NMI */ +#define printk_safe_enter_irqsave(flags) local_irq_save(flags) +#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) + +#define printk_safe_enter_irq() local_irq_disable() +#define printk_safe_exit_irq() local_irq_enable() + +#endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4ba3d34938c0..a1aecf44ab07 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,7 @@ #include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/syscalls.h> -#include <linux/kexec.h> +#include <linux/crash_core.h> #include <linux/kdb.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> @@ -45,6 +45,9 @@ #include <linux/utsname.h> #include <linux/ctype.h> #include <linux/uio.h> +#include <linux/sched/clock.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> @@ -213,17 +216,36 @@ static int nr_ext_console_drivers; static int __down_trylock_console_sem(unsigned long ip) { - if (down_trylock(&console_sem)) + int lock_failed; + unsigned long flags; + + /* + * Here and in __up_console_sem() we need to be in safe mode, + * because spindump/WARN/etc from under console ->lock will + * deadlock in printk()->down_trylock_console_sem() otherwise. + */ + printk_safe_enter_irqsave(flags); + lock_failed = down_trylock(&console_sem); + printk_safe_exit_irqrestore(flags); + + if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) -#define up_console_sem() do { \ - mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ - up(&console_sem);\ -} while (0) +static void __up_console_sem(unsigned long ip) +{ + unsigned long flags; + + mutex_release(&console_lock_dep_map, 1, ip); + + printk_safe_enter_irqsave(flags); + up(&console_sem); + printk_safe_exit_irqrestore(flags); +} +#define up_console_sem() __up_console_sem(_RET_IP_) /* * This is used for debugging the mess that is the VT code by @@ -247,8 +269,8 @@ static struct console *exclusive_console; #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; +static int console_cmdline_cnt; -static int selected_console = -1; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -351,6 +373,34 @@ __packed __aligned(4) */ DEFINE_RAW_SPINLOCK(logbuf_lock); +/* + * Helper macros to lock/unlock logbuf_lock and switch between + * printk-safe/unsafe modes. + */ +#define logbuf_lock_irq() \ + do { \ + printk_safe_enter_irq(); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irq() \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irq(); \ + } while (0) + +#define logbuf_lock_irqsave(flags) \ + do { \ + printk_safe_enter_irqsave(flags); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irqrestore(flags) \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irqrestore(flags); \ + } while (0) + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -782,20 +832,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock_irq(&logbuf_lock); + + logbuf_lock_irq(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); } if (user->seq < log_first_seq) { @@ -803,7 +854,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } @@ -816,7 +867,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (len > count) { ret = -EINVAL; @@ -843,7 +894,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); switch (whence) { case SEEK_SET: /* the first record */ @@ -867,7 +918,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -881,7 +932,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -889,7 +940,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) else ret = POLLIN|POLLRDNORM; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -919,10 +970,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); file->private_data = user; return 0; @@ -951,7 +1002,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_CORE /* * This appends the listed symbols to /proc/vmcore * @@ -960,7 +1011,7 @@ const struct file_operations kmsg_fops = { * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ -void log_buf_kexec_setup(void) +void log_buf_vmcoreinfo_setup(void) { VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); @@ -1064,13 +1115,13 @@ void __init setup_log_buf(int early) return; } - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN - log_next_idx; memcpy(log_buf, __log_buf, __LOG_BUF_LEN); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -1248,7 +1299,7 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1256,7 +1307,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial = 0; } if (syslog_seq == log_next_seq) { - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; } @@ -1275,7 +1326,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (!n) break; @@ -1304,7 +1355,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (buf) { u64 next_seq; u64 seq; @@ -1352,12 +1403,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (seq < log_first_seq) { /* messages are gone, move to next one */ @@ -1371,7 +1422,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) clear_seq = log_next_seq; clear_idx = log_next_idx; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); kfree(text); return len; @@ -1458,7 +1509,7 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1486,7 +1537,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -1510,8 +1561,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { struct console *con; @@ -1538,28 +1588,6 @@ static void call_console_drivers(int level, } } -/* - * Zap console related locks when oopsing. - * To leave time for slow consoles to print a full oops, - * only zap at most once every 30 seconds. - */ -static void zap_locks(void) -{ - static unsigned long oops_timestamp; - - if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) - return; - - oops_timestamp = jiffies; - - debug_locks_off(); - /* If a crash is occurring, make sure we can't deadlock */ - raw_spin_lock_init(&logbuf_lock); - /* And make sure that we print immediately */ - sema_init(&console_sem, 1); -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1669,18 +1697,13 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; - int this_cpu; int printed_len = 0; - int nmi_message_lost; bool in_sched = false; - /* cpu currently holding logbuf_lock in this function */ - static unsigned int logbuf_cpu = UINT_MAX; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; @@ -1690,53 +1713,8 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - local_irq_save(flags); - this_cpu = smp_processor_id(); - - /* - * Ouch, printk recursed into itself! - */ - if (unlikely(logbuf_cpu == this_cpu)) { - /* - * If a crash is occurring during printk() on this CPU, - * then try to get the crash message out but make sure - * we can't deadlock. Otherwise just return to avoid the - * recursion and return - but flag the recursion so that - * it can be printed at the next appropriate moment: - */ - if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = true; - local_irq_restore(flags); - return 0; - } - zap_locks(); - } - - lockdep_off(); /* This stops the holder of console_sem just where we want him */ - raw_spin_lock(&logbuf_lock); - logbuf_cpu = this_cpu; - - if (unlikely(recursion_bug)) { - static const char recursion_msg[] = - "BUG: recent printk recursion!"; - - recursion_bug = false; - /* emit KERN_CRIT message */ - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, - strlen(recursion_msg)); - } - - nmi_message_lost = get_nmi_message_lost(); - if (unlikely(nmi_message_lost)) { - text_len = scnprintf(textbuf, sizeof(textbuf), - "BAD LUCK: lost %d message(s) from NMI context!", - nmi_message_lost); - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, textbuf, text_len); - } - + logbuf_lock_irqsave(flags); /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1779,14 +1757,10 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - lockdep_on(); - local_irq_restore(flags); + logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { - lockdep_off(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up @@ -1794,7 +1768,6 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock()) console_unlock(); - lockdep_on(); } return printed_len; @@ -1803,7 +1776,7 @@ EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_func(fmt, args); } EXPORT_SYMBOL(vprintk); @@ -1895,16 +1868,12 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } -/* Still needs to be defined for users */ -DEFINE_PER_CPU(printk_func_t, printk_func); - #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK @@ -1937,24 +1906,38 @@ static int __add_preferred_console(char *name, int idx, char *options, * See if this tty is not yet registered, and * if we have a slot free. */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { + for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { if (strcmp(c->name, name) == 0 && c->index == idx) { - if (!brl_options) - selected_console = i; + if (brl_options) + return 0; + + /* + * Maintain an invariant that will help to find if + * the matching console is preferred, see + * register_console(): + * + * The last non-braille console is always + * the preferred one. + */ + if (i != console_cmdline_cnt - 1) + swap(console_cmdline[i], + console_cmdline[console_cmdline_cnt - 1]); + + preferred_console = console_cmdline_cnt - 1; + return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) - selected_console = i; + preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; braille_set_options(c, brl_options); c->index = idx; + console_cmdline_cnt++; return 0; } /* @@ -2062,15 +2045,16 @@ void resume_console(void) * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages - * will be spooled but will not show up on the console. This function is - * called when a new CPU comes online (or fails to come up), and ensures - * that any such output gets printed. + * will be printed on the console only if there are CON_ANYTIME consoles. + * This function is called when a new CPU comes online (or fails to come + * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - console_lock(); - console_unlock(); + /* If trylock fails, someone else is doing the printing */ + if (console_trylock()) + console_unlock(); } return 0; } @@ -2192,7 +2176,7 @@ void console_unlock(void) } /* - * Console drivers are called under logbuf_lock, so + * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() @@ -2200,11 +2184,15 @@ void console_unlock(void) * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. + * + * console_trylock() is not able to detect the preemptive + * context reliably. Therefore the value must be stored before + * and cleared after the the "again" goto label. */ do_cond_resched = console_may_schedule; +again: console_may_schedule = 0; -again: /* * We released the console_sem lock, so we need to recheck if * cpu is online and (if not) is there at least one CON_ANYTIME @@ -2220,9 +2208,9 @@ again: struct printk_log *msg; size_t ext_len = 0; size_t len; - int level; - raw_spin_lock_irqsave(&logbuf_lock, flags); + printk_safe_enter_irqsave(flags); + raw_spin_lock(&logbuf_lock); if (seen_seq != log_next_seq) { wake_klogd = true; seen_seq = log_next_seq; @@ -2243,8 +2231,7 @@ skip: break; msg = log_from_idx(console_idx); - level = msg->level; - if (suppress_message_printing(level)) { + if (suppress_message_printing(msg->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and @@ -2270,9 +2257,9 @@ skip: raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, ext_text, ext_len, text, len); + call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); - local_irq_restore(flags); + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); @@ -2295,7 +2282,8 @@ skip: */ raw_spin_lock(&logbuf_lock); retry = console_seq != log_next_seq; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock(&logbuf_lock); + printk_safe_exit_irqrestore(flags); if (retry && console_trylock()) goto again; @@ -2440,6 +2428,7 @@ void register_console(struct console *newcon) unsigned long flags; struct console *bcon = NULL; struct console_cmdline *c; + static bool has_preferred; if (console_drivers) for_each_console(bcon) @@ -2466,15 +2455,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (preferred_console < 0 || bcon || !console_drivers) - preferred_console = selected_console; + if (!has_preferred || bcon || !console_drivers) + has_preferred = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0) { + if (!has_preferred) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2482,18 +2471,29 @@ void register_console(struct console *newcon) newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; - preferred_console = 0; + has_preferred = true; } } } /* - * See if this console matches one we selected on - * the command line. + * See if this console matches one we selected on the command line. + * + * There may be several entries in the console_cmdline array matching + * with the same console, one with newcon->match(), another by + * name/index: + * + * pl011,mmio,0x87e024000000,115200 -- added from SPCR + * ttyAMA0 -- added from command line + * + * Traverse the console_cmdline array in reverse order to be + * sure that if this console is preferred then it will be the first + * matching entry. We use the invariant that is maintained in + * __add_preferred_console(). */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { + for (i = console_cmdline_cnt - 1; i >= 0; i--) { + c = console_cmdline + i; + if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ @@ -2515,9 +2515,9 @@ void register_console(struct console *newcon) } newcon->flags |= CON_ENABLED; - if (i == selected_console) { + if (i == preferred_console) { newcon->flags |= CON_CONSDEV; - preferred_console = selected_console; + has_preferred = true; } break; } @@ -2558,10 +2558,10 @@ void register_console(struct console *newcon) * console_unlock(); will print out the buffered messages * for us. */ - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -2642,6 +2642,30 @@ int unregister_console(struct console *console) EXPORT_SYMBOL(unregister_console); /* + * Initialize the console device. This is called *early*, so + * we can't necessarily depend on lots of kernel help here. + * Just do some early initializations, and do the complex setup + * later. + */ +void __init console_init(void) +{ + initcall_t *call; + + /* Setup the default TTY line discipline. */ + n_tty_init(); + + /* + * set up the console device so that later boot sequences can + * inform about problems etc.. + */ + call = __con_initcall_start; + while (call < __con_initcall_end) { + (*call)(); + call++; + } +} + +/* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. @@ -2860,12 +2884,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* initialize iterator with data about the stored records */ dumper->active = true; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; dumper->next_idx = log_next_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ dumper->dump(dumper, reason); @@ -2950,9 +2974,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, unsigned long flags; bool ret; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); return ret; } @@ -2991,7 +3015,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, if (!dumper->active) goto out; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; @@ -3000,7 +3024,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* last entry */ if (dumper->cur_seq >= dumper->next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); goto out; } @@ -3042,7 +3066,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, dumper->next_seq = next_seq; dumper->next_idx = next_idx; ret = true; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); out: if (len) *len = l; @@ -3080,9 +3104,9 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) { unsigned long flags; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); kmsg_dump_rewind_nolock(dumper); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); diff --git a/kernel/printk/nmi.c b/kernel/printk/printk_safe.c index f011aaef583c..033e50a7d706 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/printk_safe.c @@ -1,5 +1,5 @@ /* - * nmi.c - Safe printk in NMI context + * printk_safe.c - Safe printk for printk-deadlock-prone contexts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,36 +32,58 @@ * is later flushed into the main ring buffer via IRQ work. * * The alternative implementation is chosen transparently - * via @printk_func per-CPU variable. + * by examinig current printk() context mask stored in @printk_context + * per-CPU variable. * * The implementation allows to flush the strings also from another CPU. * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; -static int printk_nmi_irq_ready; -atomic_t nmi_message_lost; +static int printk_safe_irq_ready; -#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - sizeof(struct irq_work)) +#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - \ + sizeof(atomic_t) - \ + sizeof(struct irq_work)) -struct nmi_seq_buf { +struct printk_safe_seq_buf { atomic_t len; /* length of written data */ + atomic_t message_lost; struct irq_work work; /* IRQ work that flushes the buffer */ - unsigned char buffer[NMI_LOG_BUF_LEN]; + unsigned char buffer[SAFE_LOG_BUF_LEN]; }; -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); +static DEFINE_PER_CPU(int, printk_context); + +#ifdef CONFIG_PRINTK_NMI +static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); +#endif + +/* Get flushed in a more safe context. */ +static void queue_flush_work(struct printk_safe_seq_buf *s) +{ + if (printk_safe_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } +} /* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. + * Add a message to per-CPU context-dependent buffer. NMI and printk-safe + * have dedicated buffers, because otherwise printk-safe preempted by + * NMI-printk would have overwritten the NMI messages. + * + * The messages are fushed from irq work (or from panic()), possibly, + * from other CPU, concurrently with printk_safe_log_store(). Should this + * happen, printk_safe_log_store() will notice the buffer->len mismatch + * and repeat the write. */ -static int vprintk_nmi(const char *fmt, va_list args) +static int printk_safe_log_store(struct printk_safe_seq_buf *s, + const char *fmt, va_list args) { - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - int add = 0; + int add; size_t len; again: @@ -69,18 +91,21 @@ again: /* The trailing '\0' is not counted into len. */ if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&nmi_message_lost); + atomic_inc(&s->message_lost); + queue_flush_work(s); return 0; } /* - * Make sure that all old data have been read before the buffer was - * reseted. This is not needed when we just append data. + * Make sure that all old data have been read before the buffer + * was reset. This is not needed when we just append data. */ if (!len) smp_rmb(); add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + if (!add) + return 0; /* * Do it once again if the buffer has been flushed in the meantime. @@ -90,32 +115,23 @@ again: if (atomic_cmpxchg(&s->len, len, len + add) != len) goto again; - /* Get flushed in a more safe context. */ - if (add && printk_nmi_irq_ready) { - /* Make sure that IRQ work is really initialized. */ - smp_rmb(); - irq_work_queue(&s->work); - } - + queue_flush_work(s); return add; } -static void printk_nmi_flush_line(const char *text, int len) +static inline void printk_safe_flush_line(const char *text, int len) { /* - * The buffers are flushed in NMI only on panic. The messages must - * go only into the ring buffer at this stage. Consoles will get - * explicitly called later when a crashdump is not generated. + * Avoid any console drivers calls from here, because we may be + * in NMI or printk_safe context (when in panic). The messages + * must go only into the ring buffer at this stage. Consoles will + * get explicitly called later when a crashdump is not generated. */ - if (in_nmi()) - printk_deferred("%.*s", len, text); - else - printk("%.*s", len, text); - + printk_deferred("%.*s", len, text); } /* printk part of the temporary buffer line by line */ -static int printk_nmi_flush_buffer(const char *start, size_t len) +static int printk_safe_flush_buffer(const char *start, size_t len) { const char *c, *end; bool header; @@ -127,7 +143,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) /* Print line by line. */ while (c < end) { if (*c == '\n') { - printk_nmi_flush_line(start, c - start + 1); + printk_safe_flush_line(start, c - start + 1); start = ++c; header = true; continue; @@ -140,7 +156,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) continue; } - printk_nmi_flush_line(start, c - start); + printk_safe_flush_line(start, c - start); start = c++; header = true; continue; @@ -154,22 +170,31 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) if (start < end && !header) { static const char newline[] = KERN_CONT "\n"; - printk_nmi_flush_line(start, end - start); - printk_nmi_flush_line(newline, strlen(newline)); + printk_safe_flush_line(start, end - start); + printk_safe_flush_line(newline, strlen(newline)); } return len; } +static void report_message_lost(struct printk_safe_seq_buf *s) +{ + int lost = atomic_xchg(&s->message_lost, 0); + + if (lost) + printk_deferred("Lost %d message(s)!\n", lost); +} + /* - * Flush data from the associated per_CPU buffer. The function + * Flush data from the associated per-CPU buffer. The function * can be called either via IRQ work or independently. */ -static void __printk_nmi_flush(struct irq_work *work) +static void __printk_safe_flush(struct irq_work *work) { static raw_spinlock_t read_lock = __RAW_SPIN_LOCK_INITIALIZER(read_lock); - struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + struct printk_safe_seq_buf *s = + container_of(work, struct printk_safe_seq_buf, work); unsigned long flags; size_t len; int i; @@ -194,9 +219,9 @@ more: * buffer size. */ if ((i && i >= len) || len > sizeof(s->buffer)) { - const char *msg = "printk_nmi_flush: internal error\n"; + const char *msg = "printk_safe_flush: internal error\n"; - printk_nmi_flush_line(msg, strlen(msg)); + printk_safe_flush_line(msg, strlen(msg)); len = 0; } @@ -205,7 +230,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - i += printk_nmi_flush_buffer(s->buffer + i, len - i); + i += printk_safe_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate @@ -217,35 +242,40 @@ more: goto more; out: + report_message_lost(s); raw_spin_unlock_irqrestore(&read_lock, flags); } /** - * printk_nmi_flush - flush all per-cpu nmi buffers. + * printk_safe_flush - flush all per-cpu nmi buffers. * * The buffers are flushed automatically via IRQ work. This function * is useful only when someone wants to be sure that all buffers have * been flushed at some point. */ -void printk_nmi_flush(void) +void printk_safe_flush(void) { int cpu; - for_each_possible_cpu(cpu) - __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); + for_each_possible_cpu(cpu) { +#ifdef CONFIG_PRINTK_NMI + __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +#endif + __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); + } } /** - * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system * goes down. * - * Similar to printk_nmi_flush() but it can be called even in NMI context when + * Similar to printk_safe_flush() but it can be called even in NMI context when * the system goes down. It does the best effort to get NMI messages into * the main ring buffer. * * Note that it could try harder when there is only one CPU online. */ -void printk_nmi_flush_on_panic(void) +void printk_safe_flush_on_panic(void) { /* * Make sure that we could access the main ring buffer. @@ -259,33 +289,97 @@ void printk_nmi_flush_on_panic(void) raw_spin_lock_init(&logbuf_lock); } - printk_nmi_flush(); + printk_safe_flush(); } -void __init printk_nmi_init(void) +#ifdef CONFIG_PRINTK_NMI +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) { - int cpu; + struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - for_each_possible_cpu(cpu) { - struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + return printk_safe_log_store(s, fmt, args); +} - init_irq_work(&s->work, __printk_nmi_flush); - } +void printk_nmi_enter(void) +{ + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); +} - /* Make sure that IRQ works are initialized before enabling. */ - smp_wmb(); - printk_nmi_irq_ready = 1; +void printk_nmi_exit(void) +{ + this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +} - /* Flush pending messages that did not have scheduled IRQ works. */ - printk_nmi_flush(); +#else + +static int vprintk_nmi(const char *fmt, va_list args) +{ + return 0; } -void printk_nmi_enter(void) +#endif /* CONFIG_PRINTK_NMI */ + +/* + * Lock-less printk(), to avoid deadlocks should the printk() recurse + * into itself. It uses a per-CPU buffer to store the message, just like + * NMI. + */ +static int vprintk_safe(const char *fmt, va_list args) { - this_cpu_write(printk_func, vprintk_nmi); + struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); + + return printk_safe_log_store(s, fmt, args); } -void printk_nmi_exit(void) +/* Can be preempted by NMI. */ +void __printk_safe_enter(void) +{ + this_cpu_inc(printk_context); +} + +/* Can be preempted by NMI. */ +void __printk_safe_exit(void) { - this_cpu_write(printk_func, vprintk_default); + this_cpu_dec(printk_context); +} + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + return vprintk_nmi(fmt, args); + + if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) + return vprintk_safe(fmt, args); + + return vprintk_default(fmt, args); +} + +void __init printk_safe_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct printk_safe_seq_buf *s; + + s = &per_cpu(safe_print_seq, cpu); + init_irq_work(&s->work, __printk_safe_flush); + +#ifdef CONFIG_PRINTK_NMI + s = &per_cpu(nmi_print_seq, cpu); + init_irq_work(&s->work, __printk_safe_flush); +#endif + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_safe_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_safe_flush(); } diff --git a/kernel/profile.c b/kernel/profile.c index f67ce0aa6bc4..9aa2a4445b0d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -25,6 +25,8 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/sched/stat.h> + #include <asm/sections.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 49ba7c1ade9d..266ddcc1d8bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -10,6 +10,9 @@ #include <linux/capability.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/task.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/highmem.h> @@ -181,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task) WARN_ON(!task->ptrace || task->parent != current); + /* + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. + * Recheck state under the lock to close this race. + */ spin_lock_irq(&task->sighand->siglock); - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + } spin_unlock_irq(&task->sighand->siglock); } diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..23803c7d5180 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,10 +3,13 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_SRCU) += srcu.o +obj-$(CONFIG_CLASSIC_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcutree.o +obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o +obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be..73e16ec4054b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -56,6 +56,83 @@ #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ DYNTICK_TASK_FLAG) + +/* + * Grace-period counter management. + */ + +#define RCU_SEQ_CTR_SHIFT 2 +#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) + +/* + * Return the counter portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline unsigned long rcu_seq_ctr(unsigned long s) +{ + return s >> RCU_SEQ_CTR_SHIFT; +} + +/* + * Return the state portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline int rcu_seq_state(unsigned long s) +{ + return s & RCU_SEQ_STATE_MASK; +} + +/* + * Set the state portion of the pointed-to sequence number. + * The caller is responsible for preventing conflicting updates. + */ +static inline void rcu_seq_set_state(unsigned long *sp, int newstate) +{ + WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); + WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); +} + +/* Adjust sequence number for start of update-side operation. */ +static inline void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(rcu_seq_state(*sp) != 1); +} + +/* Adjust sequence number for end of update-side operation. */ +static inline void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WARN_ON_ONCE(!rcu_seq_state(*sp)); + WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); +} + +/* Take a snapshot of the update side's sequence number. */ +static inline unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* Return the current value the update side's sequence number, no ordering. */ +static inline unsigned long rcu_seq_current(unsigned long *sp) +{ + return READ_ONCE(*sp); +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the @@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + RCU_TRACE(trace_rcu_invoke_callback(rn, head);) head->func(head); rcu_lock_release(&rcu_callback_map); return false; @@ -144,4 +221,76 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); +#if defined(SRCU) || !defined(TINY_RCU) + +#include <linux/rcu_node_tree.h> + +extern int rcu_num_lvls; +extern int num_rcu_lvl[]; +extern int rcu_num_nodes; +static bool rcu_fanout_exact; +static int rcu_fanout_leaf; + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on the rcu_fanout_exact boot parameter. + */ +static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) +{ + int i; + + if (rcu_fanout_exact) { + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + levelspread[i] = RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } + } +} + +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c new file mode 100644 index 000000000000..2b62a38b080f --- /dev/null +++ b/kernel/rcu/rcu_segcblist.c @@ -0,0 +1,505 @@ +/* + * RCU segmented callback lists, function definitions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> + +#include "rcu_segcblist.h" + +/* Initialize simple callback list. */ +void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Initialize an rcu_segcblist structure. + */ +void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. + */ +bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h new file mode 100644 index 000000000000..6e36e36478cd --- /dev/null +++ b/kernel/rcu/rcu_segcblist.h @@ -0,0 +1,164 @@ +/* + * RCU segmented callback lists, internal-to-rcu header file + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#include <linux/rcu_segcblist.h> + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(!rclp->head); + return rclp->tail; +} + +void rcu_cblist_init(struct rcu_cblist *rclp); +long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); +struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +void rcu_segcblist_init(struct rcu_segcblist *rsclp); +void rcu_segcblist_disable(struct rcu_segcblist *rsclp); +bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); +bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); +bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); +void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); +bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); +void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy); +bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy); +void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); +bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); +bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 123ccbd22449..a4a86fb47e4a 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -30,6 +30,7 @@ #include <linux/rcupdate.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <uapi/linux/sched/types.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/completion.h> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 87c51225ceec..ae6e574d4cf5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -32,7 +32,8 @@ #include <linux/smp.h> #include <linux/rcupdate.h> #include <linux/interrupt.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <uapi/linux/sched/types.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/completion.h> @@ -558,19 +559,61 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int cpu; - int idx = srcu_ctlp->completed & 0x1; + int __maybe_unused cpu; + int idx; - pr_alert("%s%s per-CPU(idx=%d):", +#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) +#ifdef CONFIG_TREE_SRCU + idx = srcu_ctlp->srcu_idx & 0x1; +#else /* #ifdef CONFIG_TREE_SRCU */ + idx = srcu_ctlp->completed & 0x1; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; long c0, c1; +#ifdef CONFIG_TREE_SRCU + struct srcu_data *counts; + + counts = per_cpu_ptr(srcu_ctlp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ + struct srcu_array *counts; + + counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); + u0 = counts->unlock_count[!idx]; + u1 = counts->unlock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + +#ifdef CONFIG_TREE_SRCU + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ + l0 = counts->lock_count[!idx]; + l1 = counts->lock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ + + c0 = l0 - u0; + c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); +#elif defined(CONFIG_TINY_SRCU) + idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", + torture_type, TORTURE_FLAG, idx, + READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), + READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); +#endif } static void srcu_torture_synchronize_expedited(void) @@ -1317,12 +1360,14 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { - int __maybe_unused flags; - unsigned long __maybe_unused gpnum; - unsigned long __maybe_unused completed; + int __maybe_unused flags = 0; + unsigned long __maybe_unused gpnum = 0; + unsigned long __maybe_unused completed = 0; rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9b9cdd549caa..584d8a983883 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -22,7 +22,7 @@ * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -30,7 +30,7 @@ #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/preempt.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/sched.h> #include <linux/smp.h> #include <linux/delay.h> @@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) rcu_batch_init(&sp->batch_check1); rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; } @@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->seq[] values for the + * Returns approximate total of the readers' ->lock_count[] values for the * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); } return sum; } /* - * Returns approximate number of readers active on the specified rank - * of the per-CPU ->c[] counters. + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); } return sum; } /* * Return true if the number of pre-existing readers is determined to - * be stably zero. An example unstable zero can occur if the call - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, - * but due to task migration, sees the corresponding __srcu_read_unlock() - * decrement. This can happen because srcu_readers_active_idx() takes - * time to sum the array, and might in fact be interrupted or preempted - * partway through the summation. + * be zero. */ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { - unsigned long seq; + unsigned long unlocks; - seq = srcu_readers_seq_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(sp, idx); /* - * The following smp_mb() A pairs with the smp_mb() B located in - * __srcu_read_lock(). This pairing ensures that if an - * __srcu_read_lock() increments its counter after the summation - * in srcu_readers_active_idx(), then the corresponding SRCU read-side - * critical section will see any changes made prior to the start - * of the current SRCU grace period. + * Make sure that a lock is always counted if the corresponding unlock + * is counted. Needs to be a smp_mb() as the read side may contain a + * read from a variable that is written to before the synchronize_srcu() + * in the write side. In this case smp_mb()s A and B act like the store + * buffering pattern. * - * Also, if the above call to srcu_readers_seq_idx() saw the - * increment of ->seq[], then the call to srcu_readers_active_idx() - * must see the increment of ->c[]. + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the + * synchronize_srcu() from being executed before the grace period ends. */ smp_mb(); /* A */ /* - * Note that srcu_readers_active_idx() can incorrectly return - * zero even though there is a pre-existing reader throughout. - * To see this, suppose that task A is in a very long SRCU - * read-side critical section that started on CPU 0, and that - * no other reader exists, so that the sum of the counters - * is equal to one. Then suppose that task B starts executing - * srcu_readers_active_idx(), summing up to CPU 1, and then that - * task C starts reading on CPU 0, so that its increment is not - * summed, but finishes reading on CPU 2, so that its decrement - * -is- summed. Then when task B completes its sum, it will - * incorrectly get zero, despite the fact that task A has been - * in its SRCU read-side critical section the whole time. - * - * We therefore do a validation step should srcu_readers_active_idx() - * return zero. - */ - if (srcu_readers_active_idx(sp, idx) != 0) - return false; - - /* - * The remainder of this function is the validation step. - * The following smp_mb() D pairs with the smp_mb() C in - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen - * by srcu_readers_active_idx() above, then any destructive - * operation performed after the grace period will happen after - * the corresponding SRCU read-side critical section. + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does not + * mean that there are no more readers, as one could have read the + * current index but not have incremented the lock counter yet. * - * Note that there can be at most NR_CPUS worth of readers using - * the old index, which is not enough to overflow even a 32-bit - * integer. (Yes, this does mean that systems having more than - * a billion or so CPUs need to be 64-bit systems.) Therefore, - * the sum of the ->seq[] counters cannot possibly overflow. - * Therefore, the only way that the return values of the two - * calls to srcu_readers_seq_idx() can be equal is if there were - * no increments of the corresponding rank of ->seq[] counts - * in the interim. But the missed-increment scenario laid out - * above includes an increment of the ->seq[] counter by - * the corresponding __srcu_read_lock(). Therefore, if this - * scenario occurs, the return values from the two calls to - * srcu_readers_seq_idx() will differ, and thus the validation - * step below suffices. + * Possible bug: There is no guarantee that there haven't been ULONG_MAX + * increments of ->lock_count[] since the unlocks were counted, meaning + * that this could return true even if there are still active readers. + * Since there are no memory barriers around srcu_flip(), the CPU is not + * required to increment ->completed before running + * srcu_readers_unlock_idx(), which means that there could be an + * arbitrarily large number of critical sections that execute after + * srcu_readers_unlock_idx() but use the old value of ->completed. */ - smp_mb(); /* D */ - - return srcu_readers_seq_idx(sp, idx) == seq; + return srcu_readers_lock_idx(sp, idx) == unlocks; } /** @@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); } return sum; } @@ -276,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp) * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. + * Must invoke this only after you are finished using a given srcu_struct + * that was initialized via init_srcu_struct(). This code does some + * probabalistic checking, spotting late uses of srcu_read_lock(), + * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). + * If any such late uses are detected, the per-CPU memory associated with + * the srcu_struct is simply leaked and WARN_ON() is invoked. If the + * caller frees the srcu_struct itself, a use-after-free crash will likely + * ensue, but at least there will be a warning printed. */ void cleanup_srcu_struct(struct srcu_struct *sp) { @@ -298,9 +271,8 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->c[idx]); + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - __this_cpu_inc(sp->per_cpu_ref->seq[idx]); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -314,7 +286,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_dec(sp->per_cpu_ref->c[idx]); + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -349,12 +321,21 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) /* * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->c[] and ->seq[] arrays. This allows + * use the other rank of the ->(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) { - sp->completed++; + WRITE_ONCE(sp->completed, sp->completed + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ } /* @@ -392,6 +373,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->next = NULL; head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; @@ -425,6 +407,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->next = NULL; head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (!sp->running) { /* steal the processing owner */ sp->running = true; @@ -444,8 +427,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) spin_unlock_irq(&sp->queue_lock); } - if (!done) + if (!done) { wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ + } + } /** @@ -613,7 +599,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) /* * Invoke a limited number of SRCU callbacks that have passed through * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..36e1f82faed1 --- /dev/null +++ b/kernel/rcu/srcutiny.c @@ -0,0 +1,216 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny version for non-preemptible single-CPU use. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + */ + +#include <linux/export.h> +#include <linux/mutex.h> +#include <linux/preempt.h> +#include <linux/rcupdate_wait.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/srcu.h> + +#include <linux/rcu_node_tree.h> +#include "rcu_segcblist.h" +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->srcu_lock_nesting[0] = 0; + sp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&sp->srcu_wq); + sp->srcu_gp_seq = 0; + rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_gp_running = false; + sp->srcu_gp_waiting = false; + sp->srcu_idx = 0; + INIT_WORK(&sp->srcu_work, srcu_drive_gp); + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + flush_work(&sp->srcu_work); + WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); + WARN_ON(sp->srcu_gp_running); + WARN_ON(sp->srcu_gp_waiting); + WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx); + WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate element of + * the srcu_struct. Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + int newval = sp->srcu_lock_nesting[idx] - 1; + + WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(sp->srcu_gp_waiting)) + swake_up(&sp->srcu_wq); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * Workqueue handler to drive one grace period and invoke any callbacks + * that become ready as a result. Single-CPU and !PREEMPT operation + * means that we get away with murder on synchronization. ;-) + */ +void srcu_drive_gp(struct work_struct *wp) +{ + int idx; + struct rcu_cblist ready_cbs; + struct srcu_struct *sp; + struct rcu_head *rhp; + + sp = container_of(wp, struct srcu_struct, srcu_work); + if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + return; /* Already running or nothing to do. */ + + /* Tag recently arrived callbacks and wait for readers. */ + WRITE_ONCE(sp->srcu_gp_running, true); + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + idx = sp->srcu_idx; + WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); + WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + rcu_seq_end(&sp->srcu_gp_seq); + + /* Update callback list based on GP, and invoke ready callbacks. */ + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + rcu_cblist_init(&ready_cbs); + local_irq_disable(); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + local_irq_disable(); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + } + WRITE_ONCE(sp->srcu_gp_running, false); + + /* + * If more callbacks, reschedule ourselves. This can race with + * a call_srcu() at interrupt level, but the ->srcu_gp_running + * checks will straighten that out. + */ + if (!rcu_segcblist_empty(&sp->srcu_cblist)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_drive_gp); + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + rcu_callback_t func) +{ + unsigned long flags; + + head->func = func; + local_irq_save(flags); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + local_irq_restore(flags); + if (!READ_ONCE(sp->srcu_gp_running)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rs; + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + call_srcu(sp, &rs.head, wakeme_after_rcu); + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..3ae8474557df --- /dev/null +++ b/kernel/rcu/srcutree.c @@ -0,0 +1,1155 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + * Lai Jiangshan <laijs@cn.fujitsu.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include <linux/export.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/rcupdate_wait.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/srcu.h> + +#include "rcu.h" +#include "rcu_segcblist.h" + +ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ +module_param(exp_holdoff, ulong, 0444); + +static void srcu_invoke_callbacks(struct work_struct *work); +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Initialize SRCU combining tree. Note that statically allocated + * srcu_struct structures might already have srcu_read_lock() and + * srcu_read_unlock() running against them. So if the is_static parameter + * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + */ +static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) +{ + int cpu; + int i; + int level = 0; + int levelspread[RCU_NUM_LVLS]; + struct srcu_data *sdp; + struct srcu_node *snp; + struct srcu_node *snp_first; + + /* Work out the overall tree geometry. */ + sp->level[0] = &sp->node[0]; + for (i = 1; i < rcu_num_lvls; i++) + sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); + + /* Each pass through this loop initializes one srcu_node structure. */ + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_init(&snp->lock); + WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + ARRAY_SIZE(snp->srcu_data_have_cbs)); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { + snp->srcu_have_cbs[i] = 0; + snp->srcu_data_have_cbs[i] = 0; + } + snp->srcu_gp_seq_needed_exp = 0; + snp->grplo = -1; + snp->grphi = -1; + if (snp == &sp->node[0]) { + /* Root node, special case. */ + snp->srcu_parent = NULL; + continue; + } + + /* Non-root node. */ + if (snp == sp->level[level + 1]) + level++; + snp->srcu_parent = sp->level[level - 1] + + (snp - sp->level[level]) / + levelspread[level - 1]; + } + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + level = rcu_num_lvls - 1; + snp_first = sp->level[level]; + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_init(&sdp->lock); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; + sdp->mynode = &snp_first[cpu / levelspread[level]]; + for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { + if (snp->grplo < 0) + snp->grplo = cpu; + snp->grphi = cpu; + } + sdp->cpu = cpu; + INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + sdp->sp = sp; + sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + if (is_static) + continue; + + /* Dynamically allocated, better be no srcu_read_locks()! */ + for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { + sdp->srcu_lock_count[i] = 0; + sdp->srcu_unlock_count[i] = 0; + } + } +} + +/* + * Initialize non-compile-time initialized fields, including the + * associated srcu_node and srcu_data structures. The is_static + * parameter is passed through to init_srcu_struct_nodes(), and + * also tells us that ->sda has already been wired up to srcu_data. + */ +static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +{ + mutex_init(&sp->srcu_cb_mutex); + mutex_init(&sp->srcu_gp_mutex); + sp->srcu_idx = 0; + sp->srcu_gp_seq = 0; + sp->srcu_barrier_seq = 0; + mutex_init(&sp->srcu_barrier_mutex); + atomic_set(&sp->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&sp->work, process_srcu); + if (!is_static) + sp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(sp, is_static); + sp->srcu_gp_seq_needed_exp = 0; + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ + return sp->sda ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * First-use initialization of statically allocated srcu_struct + * structure. Wiring up the combining tree is more than can be + * done with compile-time initialization, so this check is added + * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * compile-time initialized, to resolve races involving multiple + * CPUs trying to garner first-use privileges. + */ +static void check_init_srcu_struct(struct srcu_struct *sp) +{ + unsigned long flags; + + WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); + /* The smp_load_acquire() pairs with the smp_store_release(). */ + if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + return; /* Already initialized. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore(&sp->gp_lock, flags); + return; + } + init_srcu_struct_fields(sp, true); + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Returns approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[idx]); + } + return sum; +} + +/* + * Returns approximate total of the readers' ->srcu_unlock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); + } + return sum; +} + +/* + * Return true if the number of pre-existing readers is determined to + * be zero. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long unlocks; + + unlocks = srcu_readers_unlock_idx(sp, idx); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. Needs to be a smp_mb() as the read side may + * contain a read from a variable that is written to before the + * synchronize_srcu() in the write side. In this case smp_mb()s + * A and B act like the store buffering pattern. + * + * This smp_mb() also pairs with smp_mb() C to prevent accesses + * after the synchronize_srcu() from being executed before the + * grace period ends. + */ + smp_mb(); /* A */ + + /* + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does + * not mean that there are no more readers, as one could have read + * the current index but not have incremented the lock counter yet. + * + * Possible bug: There is no guarantee that there haven't been + * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were + * counted, meaning that this could return true even if there are + * still active readers. Since there are no memory barriers around + * srcu_flip(), the CPU is not required to increment ->srcu_idx + * before running srcu_readers_unlock_idx(), which means that there + * could be an arbitrarily large number of critical sections that + * execute after srcu_readers_unlock_idx() but use the old value + * of ->srcu_idx. + */ + return srcu_readers_lock_idx(sp, idx) == unlocks; +} + +/** + * srcu_readers_active - returns true if there are readers. and false + * otherwise + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +static bool srcu_readers_active(struct srcu_struct *sp) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[0]); + sum += READ_ONCE(cpuc->srcu_lock_count[1]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); + } + return sum; +} + +#define SRCU_INTERVAL 1 + +/* + * Return grace-period delay, zero if there are expedited grace + * periods pending, SRCU_INTERVAL otherwise. + */ +static unsigned long srcu_get_delay(struct srcu_struct *sp) +{ + if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), + READ_ONCE(sp->srcu_gp_seq_needed_exp))) + return 0; + return SRCU_INTERVAL; +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int cpu; + + if (WARN_ON(!srcu_get_delay(sp))) + return; /* Leakage unless caller handles error. */ + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ + flush_delayed_work(&sp->work); + for_each_possible_cpu(cpu) + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(sp))) { + pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + return; /* Caller forgot to stop doing call_srcu()? */ + } + free_percpu(sp->sda); + sp->sda = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx) & 0x1; + __this_cpu_inc(sp->sda->srcu_lock_count[idx]); + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + this_cpu_inc(sp->sda->srcu_unlock_count[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. + */ +#define SRCU_RETRY_CHECK_DELAY 5 + +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + struct srcu_data *sdp = this_cpu_ptr(sp->sda); + int state; + + RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), + "Invoked srcu_gp_start() without ->gp_lock!"); + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ + rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); +} + +/* + * Track online CPUs to guide callback workqueue placement. + */ +DEFINE_PER_CPU(bool, srcu_online); + +void srcu_online_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), true); +} + +void srcu_offline_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), false); +} + +/* + * Place the workqueue handler on the specified CPU if online, otherwise + * just run it whereever. This is useful for placing workqueue handlers + * that are to invoke the specified CPU's callbacks. + */ +static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + bool ret; + + preempt_disable(); + if (READ_ONCE(per_cpu(srcu_online, cpu))) + ret = queue_delayed_work_on(cpu, wq, dwork, delay); + else + ret = queue_delayed_work(wq, dwork, delay); + preempt_enable(); + return ret; +} + +/* + * Schedule callback invocation for the specified srcu_data structure, + * if possible, on the corresponding CPU. + */ +static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) +{ + srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + &sdp->work, delay); +} + +/* + * Schedule callback invocation for all srcu_data structures associated + * with the specified srcu_node structure that have callbacks for the + * just-completed grace period, the one corresponding to idx. If possible, + * schedule this invocation on the corresponding CPUs. + */ +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long mask, unsigned long delay) +{ + int cpu; + + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + if (!(mask & (1 << (cpu - snp->grplo)))) + continue; + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); + } +} + +/* + * Note the end of an SRCU grace period. Initiates callback invocation + * and starts a new grace period if needed. + * + * The ->srcu_cb_mutex acquisition does not protect any data, but + * instead prevents more than one grace period from starting while we + * are initiating callback invocation. This allows the ->srcu_have_cbs[] + * array to have a finite number of elements. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + unsigned long cbdelay; + bool cbs; + unsigned long gpseq; + int idx; + int idxnext; + unsigned long mask; + struct srcu_node *snp; + + /* Prevent more than one additional grace period. */ + mutex_lock(&sp->srcu_cb_mutex); + + /* End the current grace period. */ + spin_lock_irq(&sp->gp_lock); + idx = rcu_seq_state(sp->srcu_gp_seq); + WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + cbdelay = srcu_get_delay(sp); + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + rcu_seq_end(&sp->srcu_gp_seq); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) + sp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); + /* A new grace period can start at this point. But only one. */ + + /* Initiate callback invocation as needed. */ + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_irq(&snp->lock); + cbs = false; + if (snp >= sp->level[rcu_num_lvls - 1]) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + snp->srcu_gp_seq_needed_exp = gpseq; + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq(&snp->lock); + if (cbs) { + smp_mb(); /* GP end before CB invocation. */ + srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); + } + } + + /* Callback initiation done, allow grace periods after next. */ + mutex_unlock(&sp->srcu_cb_mutex); + + /* Start a new grace period if needed. */ + spin_lock_irq(&sp->gp_lock); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (!rcu_seq_state(gpseq) && + ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { + srcu_gp_start(sp); + spin_unlock_irq(&sp->gp_lock); + /* Throttle expedited grace periods: Should be rare! */ + srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff + ? 0 : SRCU_INTERVAL); + } else { + spin_unlock_irq(&sp->gp_lock); + } +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent expedited + * grace-period requests. This function is invoked for the first known + * expedited request for a grace period that has already been requested, + * but without expediting. To start a completely new grace period, + * whether expedited or not, use srcu_funnel_gp_start() instead. + */ +static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long s) +{ + unsigned long flags; + + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) || + ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + return; + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + spin_unlock_irqrestore(&snp->lock, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore(&snp->lock, flags); + } + spin_lock_irqsave(&sp->gp_lock, flags); + if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent grace-period + * requests. The winner has to do the work of actually starting grace + * period s. Losers must either ensure that their desired grace-period + * number is recorded on at least their leaf srcu_node structure, or they + * must take steps to invoke their own callbacks. + */ +static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, + unsigned long s, bool do_norm) +{ + unsigned long flags; + int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); + struct srcu_node *snp = sdp->mynode; + unsigned long snp_seq; + + /* Each pass through the loop does one level of the srcu_node tree. */ + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + snp_seq = snp->srcu_have_cbs[idx]; + if (snp == sdp->mynode && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore(&snp->lock, flags); + if (snp == sdp->mynode && snp_seq != s) { + smp_mb(); /* CBs after GP! */ + srcu_schedule_cbs_sdp(sdp, do_norm + ? SRCU_INTERVAL + : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(sp, snp, s); + return; + } + snp->srcu_have_cbs[idx] = s; + if (snp == sdp->mynode) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + snp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore(&snp->lock, flags); + } + + /* Top of tree, must ensure the grace period will be started. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + /* + * Record need for grace period s. Pair with load + * acquire setting up for initialization. + */ + smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + } + if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; + + /* If grace period not already done and none in progress, start it. */ + if (!rcu_seq_done(&sp->srcu_gp_seq, s) && + rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + srcu_gp_start(sp); + queue_delayed_work(system_power_efficient_wq, &sp->work, + srcu_get_delay(sp)); + } + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Wait until all readers counted by array index idx complete, but + * loop an additional time if there is an expedited grace period pending. + * The caller must ensure that ->srcu_idx is not changed while checking. + */ +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +{ + for (;;) { + if (srcu_readers_active_idx_check(sp, idx)) + return true; + if (--trycount + !srcu_get_delay(sp) <= 0) + return false; + udelay(SRCU_RETRY_CHECK_DELAY); + } +} + +/* + * Increment the ->srcu_idx counter so that future SRCU readers will + * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ +} + +/* + * If SRCU is likely idle, return true, otherwise return false. + * + * Note that it is OK for several current from-idle requests for a new + * grace period from idle to specify expediting because they will all end + * up requesting the same grace period anyhow. So no loss. + * + * Note also that if any CPU (including the current one) is still invoking + * callbacks, this function will nevertheless say "idle". This is not + * ideal, but the overhead of checking all CPUs' callback lists is even + * less ideal, especially on large systems. Furthermore, the wakeup + * can happen before the callback is fully removed, so we have no choice + * but to accept this type of error. + * + * This function is also subject to counter-wrap errors, but let's face + * it, if this function was preempted for enough time for the counters + * to wrap, it really doesn't matter whether or not we expedite the grace + * period. The extra overhead of a needlessly expedited grace period is + * negligible when amoritized over that time period, and the extra latency + * of a needlessly non-expedited grace period is similarly negligible. + */ +static bool srcu_might_be_idle(struct srcu_struct *sp) +{ + unsigned long curseq; + unsigned long flags; + struct srcu_data *sdp; + unsigned long t; + + /* If the local srcu_data structure has callbacks, not idle. */ + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { + local_irq_restore(flags); + return false; /* Callbacks already present, so not idle. */ + } + local_irq_restore(flags); + + /* + * No local callbacks, so probabalistically probe global state. + * Exact information would require acquiring locks, which would + * kill scalability, hence the probabalistic nature of the probe. + */ + + /* First, see if enough time has passed since the last GP. */ + t = ktime_get_mono_fast_ns(); + if (exp_holdoff == 0 || + time_in_range_open(t, sp->srcu_last_gp_end, + sp->srcu_last_gp_end + exp_holdoff)) + return false; /* Too soon after last GP. */ + + /* Next, check for probable idleness. */ + curseq = rcu_seq_current(&sp->srcu_gp_seq); + smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ + if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) + return false; /* Grace period in progress, so not idle. */ + smp_mb(); /* Order ->srcu_gp_seq with prior access. */ + if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) + return false; /* GP # changed, so not idle. */ + return true; /* With reasonable probability, idle! */ +} + +/* + * Enqueue an SRCU callback on the srcu_data structure associated with + * the current CPU and the specified srcu_struct structure, initiating + * grace-period processing if it is not already running. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. + */ +void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) +{ + unsigned long flags; + bool needexp = false; + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + check_init_srcu_struct(sp); + rhp->func = func; + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + spin_lock(&sdp->lock); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + s = rcu_seq_snap(&sp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; + } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } + spin_unlock_irqrestore(&sdp->lock, flags); + if (needgp) + srcu_funnel_gp_start(sp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(sp, sdp->mynode, s); +} + +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func) +{ + __call_srcu(sp, rhp, func, true); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) +{ + struct rcu_synchronize rcu; + + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); + + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; + might_sleep(); + check_init_srcu_struct(sp); + init_completion(&rcu.completion); + init_rcu_head_on_stack(&rcu.head); + __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} + +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, rcu_gp_is_normal()); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, + * and then flip the srcu_idx and wait for the count of the other index. + * + * Can block; must be called from process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. + * + * If SRCU is likely idle, expedite the first request. This semantic + * was provided by Classic SRCU, and is relied upon by its users, so TREE + * SRCU must also provide it. Note that detecting idleness is heuristic + * and subject to both false positives and negatives. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp, true); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* + * Callback function for srcu_barrier() use. + */ +static void srcu_barrier_cb(struct rcu_head *rhp) +{ + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); + sp = sdp->sp; + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); +} + +/** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + int cpu; + struct srcu_data *sdp; + unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + + check_init_srcu_struct(sp); + mutex_lock(&sp->srcu_barrier_mutex); + if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + smp_mb(); /* Force ordering following return. */ + mutex_unlock(&sp->srcu_barrier_mutex); + return; /* Someone else did our work for us. */ + } + rcu_seq_start(&sp->srcu_barrier_seq); + init_completion(&sp->srcu_barrier_completion); + + /* Initial count prevents reaching zero until all CBs are posted. */ + atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + + /* + * Each pass through this loop enqueues a callback, but only + * on CPUs already having callbacks enqueued. Note that if + * a CPU already has callbacks enqueue, it must have already + * registered the need for a future grace period, so all we + * need do is enqueue a callback that will use the same + * grace period as the last callback already in the queue. + */ + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_irq(&sdp->lock); + atomic_inc(&sp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head, 0)) + atomic_dec(&sp->srcu_barrier_cpu_cnt); + spin_unlock_irq(&sdp->lock); + } + + /* Remove the initial count, at which point reaching zero can happen. */ + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); + wait_for_completion(&sp->srcu_barrier_completion); + + rcu_seq_end(&sp->srcu_barrier_seq); + mutex_unlock(&sp->srcu_barrier_mutex); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ +unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->srcu_idx; +} +EXPORT_SYMBOL_GPL(srcu_batches_completed); + +/* + * Core SRCU state machine. Push state bits of ->srcu_gp_seq + * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has + * completed in that state. + */ +static void srcu_advance_state(struct srcu_struct *sp) +{ + int idx; + + mutex_lock(&sp->srcu_gp_mutex); + + /* + * Because readers might be delayed for an extended period after + * fetching ->srcu_idx for their index, at any point in time there + * might well be readers using both idx=0 and idx=1. We therefore + * need to wait for readers to clear from both index values before + * invoking a callback. + * + * The load-acquire ensures that we see the accesses performed + * by the prior grace period. + */ + idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + if (idx == SRCU_STATE_IDLE) { + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); + return; + } + idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (idx == SRCU_STATE_IDLE) + srcu_gp_start(sp); + spin_unlock_irq(&sp->gp_lock); + if (idx != SRCU_STATE_IDLE) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* Someone else started the grace period. */ + } + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 1)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_flip(sp); + rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + + /* + * SRCU read-side critical sections are normally short, + * so check at least twice in quick succession after a flip. + */ + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 2)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ + } +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period. If there are more to do, SRCU will reschedule + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). + */ +static void srcu_invoke_callbacks(struct work_struct *work) +{ + bool more; + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(work, struct srcu_data, work.work); + sp = sdp->sp; + rcu_cblist_init(&ready_cbs); + spin_lock_irq(&sdp->lock); + smp_mb(); /* Old grace periods before callback invocation! */ + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (sdp->srcu_cblist_invoking || + !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { + spin_unlock_irq(&sdp->lock); + return; /* Someone else on the job or nothing to do. */ + } + + /* We are on the job! Extract and invoke ready callbacks. */ + sdp->srcu_cblist_invoking = true; + rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sdp->lock); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + + /* + * Update counts, accelerate new callbacks, and if needed, + * schedule another round of callback invocation. + */ + spin_lock_irq(&sdp->lock); + rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + sdp->srcu_cblist_invoking = false; + more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); + spin_unlock_irq(&sdp->lock); + if (more) + srcu_schedule_cbs_sdp(sdp, 0); +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +{ + bool pushgp = true; + + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + /* All requests fulfilled, time to go idle. */ + pushgp = false; + } + } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + /* Outstanding request and no GP. Start one. */ + srcu_gp_start(sp); + } + spin_unlock_irq(&sp->gp_lock); + + if (pushgp) + queue_delayed_work(system_power_efficient_wq, &sp->work, delay); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_advance_state(sp); + srcu_reschedule(sp, srcu_get_delay(sp)); +} +EXPORT_SYMBOL_GPL(process_srcu); + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = rcu_seq_ctr(sp->srcu_gp_seq); + *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); +} +EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b23a4d076f3d..e5385731e391 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -25,7 +25,7 @@ #include <linux/completion.h> #include <linux/interrupt.h> #include <linux/notifier.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mutex.h> @@ -41,14 +41,24 @@ /* Forward declarations for tiny_plugin.h. */ struct rcu_ctrlblk; -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_ctrlblk *rcp); #include "tiny_plugin.h" +void rcu_barrier_bh(void) +{ + wait_rcu_gp(call_rcu_bh); +} +EXPORT_SYMBOL(rcu_barrier_bh); + +void rcu_barrier_sched(void) +{ + wait_rcu_gp(call_rcu_sched); +} +EXPORT_SYMBOL(rcu_barrier_sched); + #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) /* @@ -69,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp)); + RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -115,7 +125,7 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls()); + RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -133,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0); + RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -142,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -152,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name); + RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); @@ -161,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) __rcu_reclaim(rn, list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++); + RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), @@ -211,7 +221,7 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++); + RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -244,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca..371034e77f87 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { RCU_TRACE(.name = "rcu_bh") }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include <linux/kernel_stat.h> int rcu_scheduler_active __read_mostly; @@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. * The reason for this is that Tiny RCU does not need kthreads, so does * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. + * at a certain phase of the boot process. Unless SRCU is in the mix. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) + ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; } -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #ifdef CONFIG_RCU_TRACE @@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) static void check_cpu_stalls(void) { - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) } #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb4e2056ccf3..e354e475e645 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -32,9 +32,10 @@ #include <linux/init.h> #include <linux/spinlock.h> #include <linux/smp.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/atomic.h> #include <linux/bitops.h> @@ -49,12 +50,14 @@ #include <linux/kernel_stat.h> #include <linux/wait.h> #include <linux/kthread.h> +#include <uapi/linux/sched/types.h> #include <linux/prefetch.h> #include <linux/delay.h> #include <linux/stop_machine.h> #include <linux/random.h> #include <linux/trace_events.h> #include <linux/suspend.h> +#include <linux/ftrace.h> #include "tree.h" #include "rcu.h" @@ -95,8 +98,8 @@ struct rcu_state sname##_state = { \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ - .orphan_nxttail = &sname##_state.orphan_nxtlist, \ - .orphan_donetail = &sname##_state.orphan_donelist, \ + .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ + .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -121,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ -static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; +int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; @@ -197,7 +200,7 @@ static const int gp_cleanup_delay; /* * Number of grace periods between delays, normalized by the duration of - * the delay. The longer the the delay, the more the grace periods between + * the delay. The longer the delay, the more the grace periods between * each delay. The reason for this normalization is that it means that, * for non-zero delays, the overall slowdown of grace periods is constant * regardless of the duration of the delay. This arrangement balances @@ -270,19 +273,183 @@ void rcu_bh_qs(void) } } -static DEFINE_PER_CPU(int, rcu_sched_qs_mask); +/* + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. + */ +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) +#endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, .dynticks_idle = ATOMIC_INIT(1), #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; -DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); -EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); +/* + * There's a few places, currently just in the tracing infrastructure, + * that uses rcu_irq_enter() to make sure RCU is watching. But there's + * a small location where that will not even work. In those cases + * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() + * can be called. + */ +static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); + +bool rcu_irq_enter_disabled(void) +{ + return this_cpu_read(disable_rcu_irq_enter); +} + +/* + * Record entry into an extended quiescent state. This is only to be + * called when not already in an extended quiescent state. + */ +static void rcu_dynticks_eqs_enter(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int seq; + + /* + * CPUs seeing atomic_add_return() must see prior RCU read-side + * critical sections, and we also must force ordering with the + * next idle sojourn. + */ + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* Better be in an extended quiescent state! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_CTR)); + /* Better not have special action (TLB flush) pending! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_MASK)); +} + +/* + * Record exit from an extended quiescent state. This is only to be + * called from an extended quiescent state. + */ +static void rcu_dynticks_eqs_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int seq; + + /* + * CPUs seeing atomic_add_return() must see prior idle sojourns, + * and we also must force ordering with the next RCU read-side + * critical section. + */ + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(seq & RCU_DYNTICK_CTRL_CTR)); + if (seq & RCU_DYNTICK_CTRL_MASK) { + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + smp_mb__after_atomic(); /* _exit after clearing mask. */ + /* Prefer duplicate flushes to losing a flush. */ + rcu_eqs_special_exit(); + } +} + +/* + * Reset the current CPU's ->dynticks counter to indicate that the + * newly onlined CPU is no longer in an extended quiescent state. + * This will either leave the counter unchanged, or increment it + * to the next non-quiescent value. + * + * The non-atomic test/increment sequence works because the upper bits + * of the ->dynticks counter are manipulated only by the corresponding CPU, + * or when the corresponding CPU is offline. + */ +static void rcu_dynticks_eqs_online(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) + return; + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); +} + +/* + * Is the current CPU in an extended quiescent state? + * + * No ordering, as we are sampling CPU-local information. + */ +bool rcu_dynticks_curr_cpu_in_eqs(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); +} + +/* + * Snapshot the ->dynticks counter with full ordering so as to allow + * stable comparison of this counter with past and future snapshots. + */ +int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +{ + int snap = atomic_add_return(0, &rdtp->dynticks); + + return snap & ~RCU_DYNTICK_CTRL_MASK; +} + +/* + * Return true if the snapshot returned from rcu_dynticks_snap() + * indicates that RCU is in an extended quiescent state. + */ +static bool rcu_dynticks_in_eqs(int snap) +{ + return !(snap & RCU_DYNTICK_CTRL_CTR); +} + +/* + * Return true if the CPU corresponding to the specified rcu_dynticks + * structure has spent some time in an extended quiescent state since + * rcu_dynticks_snap() returned the specified snapshot. + */ +static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) +{ + return snap != rcu_dynticks_snap(rdtp); +} + +/* + * Do a double-increment of the ->dynticks counter to emulate a + * momentary idle-CPU quiescent state. + */ +static void rcu_dynticks_momentary_idle(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &rdtp->dynticks); + + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); +} + +/* + * Set the special (bottom) bit of the specified CPU so that it + * will take special action (such as flushing its TLB) on the + * next exit from an extended quiescent state. Returns true if + * the bit was successfully set, or false if the CPU was not in + * an extended quiescent state. + */ +bool rcu_eqs_special_set(int cpu) +{ + int old; + int new; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + do { + old = atomic_read(&rdtp->dynticks); + if (old & RCU_DYNTICK_CTRL_CTR) + return false; + new = old | RCU_DYNTICK_CTRL_MASK; + } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + return true; +} /* * Let the RCU core know that this CPU has gone through the scheduler, @@ -291,48 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * memory barriers to let the RCU core know about it, regardless of what * this CPU might (or might not) do in the near future. * - * We inform the RCU core by emulating a zero-duration dyntick-idle - * period, which we in turn do by incrementing the ->dynticks counter - * by two. + * We inform the RCU core by emulating a zero-duration dyntick-idle period. * * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - struct rcu_data *rdp; - struct rcu_dynticks *rdtp; - int resched_mask; - struct rcu_state *rsp; - - /* - * Yes, we can lose flag-setting operations. This is OK, because - * the flag will be set again after some delay. - */ - resched_mask = raw_cpu_read(rcu_sched_qs_mask); - raw_cpu_write(rcu_sched_qs_mask, 0); - - /* Find the flavor that needs a quiescent state. */ - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (!(resched_mask & rsp->flavor_mask)) - continue; - smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (READ_ONCE(rdp->mynode->completed) != - READ_ONCE(rdp->cond_resched_completed)) - continue; - - /* - * Pretend to be momentarily idle for the quiescent state. - * This allows the grace-period kthread to record the - * quiescent state, with no need for this CPU to do anything - * further. - */ - rdtp = this_cpu_ptr(&rcu_dynticks); - smp_mb__before_atomic(); /* Earlier stuff before QS. */ - atomic_add(2, &rdtp->dynticks); /* QS. */ - smp_mb__after_atomic(); /* Later stuff after QS. */ - break; - } + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); + rcu_dynticks_momentary_idle(); } /* @@ -340,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled interrupts. */ -void rcu_note_context_switch(void) +void rcu_note_context_switch(bool preempt) { barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); + if (!preempt) + rcu_note_voluntary_context_switch_lite(current); +out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -370,29 +511,26 @@ void rcu_all_qs(void) { unsigned long flags; + if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { - /* - * Yes, we just checked a per-CPU variable with preemption - * enabled, so we might be migrated to some other CPU at - * this point. That is OK because in that case, the - * migration will supply the needed quiescent state. - * We might end up needlessly disabling preemption and - * invoking rcu_sched_qs() on the destination CPU, but - * the probability and cost are both quite low, so this - * should not be a problem in practice. - */ - preempt_disable(); + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) rcu_sched_qs(); - preempt_enable(); - } - this_cpu_inc(rcu_qs_ctr); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -581,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, default: break; } - if (rsp != NULL) { - *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + if (rsp == NULL) return; - } - *flags = 0; - *gpnum = 0; - *completed = 0; + *flags = READ_ONCE(rsp->gp_flags); + *gpnum = READ_ONCE(rsp->gpnum); + *completed = READ_ONCE(rsp->completed); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); @@ -605,16 +739,6 @@ void rcutorture_record_progress(unsigned long vernum) EXPORT_SYMBOL_GPL(rcutorture_record_progress); /* - * Does the CPU have callbacks ready to be invoked? - */ -static int -cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) -{ - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_DONE_TAIL] != NULL; -} - -/* * Return the root node of the specified rcu_state structure. */ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) @@ -644,44 +768,39 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - int i; - if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) return true; /* Yes, a no-CBs CPU needs one. */ - if (!rdp->nxttail[RCU_NEXT_TAIL]) + if (!rcu_segcblist_is_enabled(&rdp->cblist)) return false; /* No, this is a no-CBs (or offline) CPU. */ - if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return true; /* Yes, CPU has newly registered callbacks. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rdp->nxttail[i - 1] != rdp->nxttail[i] && - ULONG_CMP_LT(READ_ONCE(rsp->completed), - rdp->nxtcompleted[i])) - return true; /* Yes, CBs for future grace period. */ + if (rcu_segcblist_future_gp_needed(&rdp->cblist, + READ_ONCE(rsp->completed))) + return true; /* Yes, CBs for future grace period. */ return false; /* No grace period needed. */ } /* - * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state + * rcu_eqs_enter_common - current CPU is entering an extended quiescent state * - * If the new value of the ->dynticks_nesting counter now is zero, - * we really have entered idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. + * Enter idle, doing appropriate accounting. The caller must have + * disabled interrupts. */ -static void rcu_eqs_enter_common(long long oldval, bool user) +static void rcu_eqs_enter_common(bool user) { struct rcu_state *rsp; struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); + trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -692,12 +811,10 @@ static void rcu_eqs_enter_common(long long oldval, bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - atomic_read(&rdtp->dynticks) & 0x1); + __this_cpu_inc(disable_rcu_irq_enter); + rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ + rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ + __this_cpu_dec(disable_rcu_irq_enter); rcu_dynticks_task_enter(); /* @@ -718,19 +835,15 @@ static void rcu_eqs_enter_common(long long oldval, bool user) */ static void rcu_eqs_enter(bool user) { - long long oldval; struct rcu_dynticks *rdtp; rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_nesting = 0; - rcu_eqs_enter_common(oldval, user); - } else { + (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); + if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rcu_eqs_enter_common(user); + else rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - } } /** @@ -789,19 +902,18 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - long long oldval; struct rcu_dynticks *rdtp; RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting--; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting < 0); - if (rdtp->dynticks_nesting) - trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); - else - rcu_eqs_enter_common(oldval, true); + rdtp->dynticks_nesting < 1); + if (rdtp->dynticks_nesting <= 1) { + rcu_eqs_enter_common(true); + } else { + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); + rdtp->dynticks_nesting--; + } rcu_sysidle_enter(1); } @@ -826,15 +938,10 @@ void rcu_irq_exit_irqson(void) */ static void rcu_eqs_exit_common(long long oldval, int user) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) rcu_dynticks_task_exit(); - smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -980,12 +1087,8 @@ void rcu_nmi_enter(void) * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (!(atomic_read(&rdtp->dynticks) & 0x1)) { - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* atomic_inc() before later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + if (rcu_dynticks_curr_cpu_in_eqs()) { + rcu_dynticks_eqs_exit(); incby = 1; } rdtp->dynticks_nmi_nesting += incby; @@ -1010,7 +1113,7 @@ void rcu_nmi_exit(void) * to us!) */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so @@ -1023,11 +1126,7 @@ void rcu_nmi_exit(void) /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ rdtp->dynticks_nmi_nesting = 0; - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force delay to next write. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_eqs_enter(); } /** @@ -1040,7 +1139,7 @@ void rcu_nmi_exit(void) */ bool notrace __rcu_is_watching(void) { - return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; + return !rcu_dynticks_curr_cpu_in_eqs(); } /** @@ -1060,6 +1159,24 @@ bool notrace rcu_is_watching(void) } EXPORT_SYMBOL_GPL(rcu_is_watching); +/* + * If a holdout task is actually running, request an urgent quiescent + * state from its CPU. This is unsynchronized, so migrations can cause + * the request to go to the wrong CPU. Which is OK, all that will happen + * is that the CPU's next context switch will be a bit slower and next + * time around this task will generate another request. + */ +void rcu_request_urgent_qs_task(struct task_struct *t) +{ + int cpu; + + barrier(); + cpu = task_cpu(t); + if (!task_curr(t)) + return; /* This task is not running on that CPU. */ + smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* @@ -1123,9 +1240,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - if ((rdp->dynticks_snap & 0x1) == 0) { + if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) @@ -1144,12 +1261,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - unsigned int curr; - int *rcrmp; - unsigned int snap; - - curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned int)rdp->dynticks_snap; + unsigned long jtsq; + bool *rnhqp; + bool *ruqp; + unsigned long rjtsc; + struct rcu_node *rnp; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1159,27 +1275,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; } + /* Compute and saturate jiffies_till_sched_qs. */ + jtsq = jiffies_till_sched_qs; + rjtsc = rcu_jiffies_till_stall_check(); + if (jtsq > rjtsc / 2) { + WRITE_ONCE(jiffies_till_sched_qs, rjtsc); + jtsq = rjtsc / 2; + } else if (jtsq < 1) { + WRITE_ONCE(jiffies_till_sched_qs, 1); + jtsq = 1; + } + /* - * Check for the CPU being offline, but only if the grace period - * is old enough. We don't need to worry about the CPU changing - * state: If we see it offline even once, it has been through a - * quiescent state. - * - * The reason for insisting that the grace period be at least - * one jiffy old is that CPUs that are not quite online and that - * have just gone offline can still execute RCU read-side critical - * sections. + * Has this CPU encountered a cond_resched_rcu_qs() since the + * beginning of the grace period? For this to be the case, + * the CPU has to have noticed the current grace period. This + * might not be the case for nohz_full CPUs looping in the kernel. */ - if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) - return 0; /* Grace period is not old enough. */ - barrier(); - if (cpu_is_offline(rdp->cpu)) { + rnp = rdp->mynode; + ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + return 1; + } else { + /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ + smp_store_release(ruqp, true); + } + + /* Check for the CPU being offline. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; return 1; @@ -1192,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * in-kernel CPU-bound tasks cannot advance grace periods. * So if the grace period is old enough, make the CPU pay attention. * Note that the unsynchronized assignments to the per-CPU - * rcu_sched_qs_mask variable are safe. Yes, setting of + * rcu_need_heavy_qs variable are safe. Yes, setting of * bits can be lost, but they will be set again on the next * force-quiescent-state pass. So lost bit sets do not result * in incorrect behavior, merely in a grace period lasting @@ -1206,25 +1338,22 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * is set too high, we override with half of the RCU CPU stall * warning delay. */ - rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - WRITE_ONCE(rdp->cond_resched_completed, - READ_ONCE(rdp->mynode->completed)); - smp_mb(); /* ->cond_resched_completed before *rcrmp. */ - WRITE_ONCE(*rcrmp, - READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - } + rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + if (!READ_ONCE(*rnhqp) && + (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched))) { + WRITE_ONCE(*rnhqp, true); + /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ + smp_store_release(ruqp, true); rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } - /* And if it has been a really long time, kick the CPU as well. */ - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + /* + * If more than halfway to RCU CPU stall-warning time, do + * a resched_cpu() to try to loosen things up a bit. + */ + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + resched_cpu(rdp->cpu); return 0; } @@ -1277,7 +1406,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1287,11 +1419,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1373,12 +1504,16 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); + + /* Complain about tasks blocking the grace period. */ + rcu_print_detail_task_stall(rsp); } else { if (READ_ONCE(rsp->gpnum) != gpnum || READ_ONCE(rsp->completed) == gpnum) { @@ -1395,9 +1530,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } } - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); - rcu_check_gp_kthread_starvation(rsp); panic_on_rcu_stall(); @@ -1427,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); @@ -1530,30 +1663,6 @@ void rcu_cpu_stall_reset(void) } /* - * Initialize the specified rcu_data structure's default callback list - * to empty. The default callback list is the one that is not used by - * no-callbacks CPUs. - */ -static void init_default_callback_list(struct rcu_data *rdp) -{ - int i; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; -} - -/* - * Initialize the specified rcu_data structure's callback list to empty. - */ -static void init_callback_list(struct rcu_data *rdp) -{ - if (init_nocb_callback_list(rdp)) - return; - init_default_callback_list(rdp); -} - -/* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that * a CPU can invoke callbacks in a timely fashion even if that CPU has @@ -1607,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long *c_out) { unsigned long c; - int i; bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); @@ -1653,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Get a new grace-period number. If there really is no grace * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. Note that even no-CBs - * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + * earlier. Adjust callbacks as needed. */ c = rcu_cbs_completed(rdp->rsp, rnp_root); - for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) - if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) - rdp->nxtcompleted[i] = c; + if (!rcu_is_nocb_cpu(rdp->cpu)) + (void)rcu_segcblist_accelerate(&rdp->cblist, c); /* * If the needed for the required grace period is already @@ -1691,9 +1797,7 @@ out: /* * Clean up any old requests for the just-ended grace period. Also return - * whether any additional grace periods have been requested. Also invoke - * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads - * waiting for this grace period to complete. + * whether any additional grace periods have been requested. */ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { @@ -1739,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; - int i; - bool ret; - - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return false; - - /* - * Starting from the sublist containing the callbacks most - * recently assigned a ->completed number and working down, find the - * first sublist that is not assignable to an upcoming grace period. - * Such a sublist has something in it (first two tests) and has - * a ->completed number assigned that will complete sooner than - * the ->completed number for newly arrived callbacks (last test). - * - * The key point is that any later sublist can be assigned the - * same ->completed number as the newly arrived callbacks, which - * means that the callbacks in any of these later sublist can be - * grouped into a single sublist, whether or not they have already - * been assigned a ->completed number. - */ - c = rcu_cbs_completed(rsp, rnp); - for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] != rdp->nxttail[i - 1] && - !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) - break; + bool ret = false; - /* - * If there are no sublist for unassigned callbacks, leave. - * At the same time, advance "i" one sublist, so that "i" will - * index into the sublist where all the remaining callbacks should - * be grouped into. - */ - if (++i >= RCU_NEXT_TAIL) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* - * Assign all subsequent callbacks' ->completed number to the next - * full grace period and group them all in the sublist initially - * indexed by "i". + * Callbacks are often registered with incomplete grace-period + * information. Something about the fact that getting exact + * information requires acquiring a global lock... RCU therefore + * makes a conservative estimate of the grace period number at which + * a given callback will become ready to invoke. The following + * code checks this estimate and improves it when possible, thus + * accelerating callback invocation to an earlier grace-period + * number. */ - for (; i <= RCU_NEXT_TAIL; i++) { - rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtcompleted[i] = c; - } - /* Record any needed additional grace periods. */ - ret = rcu_start_future_gp(rnp, rdp, NULL); + if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ - if (!*rdp->nxttail[RCU_WAIT_TAIL]) + if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); @@ -1809,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i, j; - - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* * Find all callbacks whose ->completed numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) - break; - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; - } - /* Clean up any sublist tail pointers that were misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; - - /* Copy down callbacks to fill in empty sublists. */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) - break; - rdp->nxttail[j] = rdp->nxttail[i]; - rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; - } + rcu_segcblist_advance(&rdp->cblist, rnp->completed); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1879,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); @@ -2467,10 +2524,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || - rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || - rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2479,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -2525,8 +2580,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) + if (rdp->cpu_no_qs.b.norm) return; /* @@ -2554,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * because _rcu_barrier() excludes CPU-hotplug operations, so it * cannot be running now. Thus no memory barrier is required. */ - if (rdp->nxtlist != NULL) { - rsp->qlen_lazy += rdp->qlen_lazy; - rsp->qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - rdp->qlen_lazy = 0; - WRITE_ONCE(rdp->qlen, 0); - } + rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); + rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); /* * Next, move those callbacks still needing a grace period to @@ -2568,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Some of the callbacks might have gone partway through a grace * period, but that is too bad. They get to start over because we * cannot assume that grace periods are synchronized across CPUs. - * We don't bother updating the ->nxttail[] array yet, instead - * we just reset the whole thing later on. */ - if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { - *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; - rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - } + rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); /* * Then move the ready-to-invoke callbacks to the orphanage, * where some other CPU will pick them up. These will not be * required to pass though another grace period: They are done. */ - if (rdp->nxtlist != NULL) { - *rsp->orphan_donetail = rdp->nxtlist; - rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; - } + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - /* - * Finally, initialize the rcu_data structure's list to empty and - * disallow further callbacks on this CPU. - */ - init_callback_list(rdp); - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + /* Finally, disallow further callbacks on this CPU. */ + rcu_segcblist_disable(&rdp->cblist); } /* @@ -2601,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { - int i; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ @@ -2610,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) return; /* Do the accounting first. */ - rdp->qlen_lazy += rsp->qlen_lazy; - rdp->qlen += rsp->qlen; - rdp->n_cbs_adopted += rsp->qlen; - if (rsp->qlen_lazy != rsp->qlen) + rdp->n_cbs_adopted += rsp->orphan_done.len; + if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); - rsp->qlen_lazy = 0; - rsp->qlen = 0; + rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); /* * We do not need a memory barrier here because the only way we @@ -2624,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) * we are the task doing the rcu_barrier(). */ - /* First adopt the ready-to-invoke callbacks. */ - if (rsp->orphan_donelist != NULL) { - *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; - for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = rsp->orphan_donetail; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; - } - - /* And then adopt the callbacks that still need a grace period. */ - if (rsp->orphan_nxtlist != NULL) { - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; - } + /* First adopt the ready-to-invoke callbacks, then the done ones. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); + WARN_ON_ONCE(rsp->orphan_done.head); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + WARN_ON_ONCE(rsp->orphan_pend.head); + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != + !rcu_segcblist_n_cbs(&rdp->cblist)); } /* @@ -2649,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask); - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + RCU_TRACE(unsigned long mask;) + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) + RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask); + RCU_TRACE(mask = rdp->grpmask;) trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), TPS("cpuofl")); @@ -2729,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", - cpu, rdp->qlen, rdp->nxtlist); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -2741,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; - struct rcu_head *next, *list, **tail; - long bl, count, count_lazy; - int i; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + long bl, count; /* If no callbacks are ready, just return. */ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); - trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), + if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { + trace_rcu_batch_start(rsp->name, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), 0); + trace_rcu_batch_end(rsp->name, 0, + !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); return; @@ -2756,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* * Extract the list of ready callbacks, disabling to prevent - * races with call_rcu() from interrupt handlers. + * races with call_rcu() from interrupt handlers. Leave the + * callback counts, as rcu_barrier() needs to be conservative. */ local_irq_save(flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); - list = rdp->nxtlist; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - tail = rdp->nxttail[RCU_DONE_TAIL]; - for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = &rdp->nxtlist; + trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), bl); + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); local_irq_restore(flags); /* Invoke callbacks. */ - count = count_lazy = 0; - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - if (__rcu_reclaim(rsp->name, list)) - count_lazy++; - list = next; - /* Stop only if limit reached and CPU has something to do. */ - if (++count >= bl && + rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_unqueue(rhp); + if (__rcu_reclaim(rsp->name, rhp)) + rcu_cblist_dequeued_lazy(&rcl); + /* + * Stop only if limit reached and CPU has something to do. + * Note: The rcl structure counts down from zero. + */ + if (-rcl.len >= bl && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; } local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread()); - - /* Update count, and requeue any remaining callbacks. */ - if (list != NULL) { - *tail = rdp->nxtlist; - rdp->nxtlist = list; - for (i = 0; i < RCU_NEXT_SIZE; i++) - if (&rdp->nxtlist == rdp->nxttail[i]) - rdp->nxttail[i] = tail; - else - break; - } + count = -rcl.len; + trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), + is_idle_task(current), rcu_is_callbacks_kthread()); + + /* Update counts and requeue any remaining callbacks. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->qlen_lazy -= count_lazy; - WRITE_ONCE(rdp->qlen, rdp->qlen - count); rdp->n_cbs_invoked += count; + rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ - if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + count = rcu_segcblist_n_cbs(&rdp->cblist); + if (rdp->blimit == LONG_MAX && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ - if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; - } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) - rdp->qlen_last_fqs_check = rdp->qlen; - WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); + } else if (count < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = count; + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); } @@ -2988,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - WARN_ON_ONCE(rdp->beenonline == 0); + WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -3006,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) } /* If there are callbacks ready, invoke them. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ @@ -3078,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * invoking force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > + rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ note_gp_changes(rsp, rdp); @@ -3096,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) + rcu_segcblist_first_pend_cb(&rdp->cblist) != head) force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } } @@ -3139,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { int offline; if (cpu != -1) @@ -3158,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, */ BUG_ON(cpu != -1); WARN_ON_ONCE(!rcu_is_watching()); - if (!likely(rdp->nxtlist)) - init_default_callback_list(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); } - WRITE_ONCE(rdp->qlen, rdp->qlen + 1); - if (lazy) - rdp->qlen_lazy++; - else + rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + if (!lazy) rcu_idle_count_callbacks_posted(); - smp_mb(); /* Count before adding callback for rcu_barrier(). */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen_lazy, rdp->qlen); + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); else - trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + trace_rcu_callback(rsp->name, head, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ __call_rcu_core(rsp, rdp, head, flags); @@ -3420,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_sched); -/* Adjust sequence number for start of update-side operation. */ -static void rcu_seq_start(unsigned long *sp) -{ - WRITE_ONCE(*sp, *sp + 1); - smp_mb(); /* Ensure update-side operation after counter increment. */ - WARN_ON_ONCE(!(*sp & 0x1)); -} - -/* Adjust sequence number for end of update-side operation. */ -static void rcu_seq_end(unsigned long *sp) -{ - smp_mb(); /* Ensure update-side operation before counter increment. */ - WRITE_ONCE(*sp, *sp + 1); - WARN_ON_ONCE(*sp & 0x1); -} - -/* Take a snapshot of the update side's sequence number. */ -static unsigned long rcu_seq_snap(unsigned long *sp) -{ - unsigned long s; - - s = (READ_ONCE(*sp) + 3) & ~0x1; - smp_mb(); /* Above access must not bleed into critical section. */ - return s; -} - -/* - * Given a snapshot from rcu_seq_snap(), determine whether or not a - * full update-side operation has occurred. - */ -static bool rcu_seq_done(unsigned long *sp, unsigned long s) -{ - return ULONG_CMP_GE(READ_ONCE(*sp), s); -} - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -3478,17 +3456,15 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && - (!rdp->cpu_no_qs.b.norm || - rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { + } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; return 1; } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) { + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { rdp->n_rp_cb_ready++; return 1; } @@ -3552,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!rdp->nxtlist) + if (rcu_segcblist_empty(&rdp->cblist)) continue; hc = true; - if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { + if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { al = false; break; } @@ -3664,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp) __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); } - } else if (READ_ONCE(rdp->qlen)) { + } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); @@ -3748,7 +3724,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -3765,7 +3741,6 @@ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3774,12 +3749,12 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - if (!rdp->nxtlist) - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ + !init_nocb_callback_list(rdp)) + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); - atomic_set(&rdp->dynticks->dynticks, - (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* @@ -3788,7 +3763,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) * of the next grace period. */ rnp = rdp->mynode; - mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); @@ -3796,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); + rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * Invoked early in the CPU-online process, when pretty much all + * services are available. The incoming CPU is not present. + */ int rcutree_prepare_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3815,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } +/* + * Update RCU priority boot kthread affinity for CPU-hotplug changes. + */ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) { struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); @@ -3822,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } +/* + * Near the end of the CPU-online process. Pretty much all services + * enabled, and the CPU is now very much alive. + */ int rcutree_online_cpu(unsigned int cpu) { sync_sched_exp_online_cleanup(cpu); rcutree_affinity_setting(cpu, -1); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); return 0; } +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. + */ int rcutree_offline_cpu(unsigned int cpu) { rcutree_affinity_setting(cpu, cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_offline_cpu(cpu); return 0; } - +/* + * Near the end of the offline process. We do only tracing here. + */ int rcutree_dying_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3845,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; } +/* + * The outgoing CPU is gone and we are running elsewhere. + */ int rcutree_dead_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3862,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu) * incoming CPUs are not allowed to use RCU read-side critical sections * until this function is called. Failing to observe this restriction * will result in lockdep splats. + * + * Note that this function is special in that it is invoked directly + * from the incoming CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. */ void rcu_cpu_starting(unsigned int cpu) { @@ -3872,7 +3874,7 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); + rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -3887,9 +3889,6 @@ void rcu_cpu_starting(unsigned int cpu) * The CPU is exiting the idle loop into the arch_cpu_idle_dead() * function. We now remove it from the rcu_node tree's ->qsmaskinit * bit masks. - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. */ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) { @@ -3905,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * The outgoing function has no further need of RCU, so remove it from + * the list of CPUs that RCU must track. + * + * Note that this function is special in that it is invoked directly + * from the outgoing CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. + */ void rcu_report_dead(unsigned int cpu) { struct rcu_state *rsp; @@ -3919,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu) } #endif +/* + * On non-huge systems, use expedited RCU grace periods to make suspend + * and hibernation run faster. + */ static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -3989,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread); * task is booting the system, and such primitives are no-ops). After this * function is called, any synchronous grace-period primitives are run as * expedited, with the requesting task driving the grace period forward. - * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * A later core_initcall() rcu_set_runtime_mode() will switch to full * runtime RCU functionality. */ void rcu_scheduler_starting(void) @@ -4002,31 +4013,6 @@ void rcu_scheduler_starting(void) } /* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on the rcu_fanout_exact boot parameter. - */ -static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) -{ - int i; - - if (rcu_fanout_exact) { - levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - levelspread[i] = RCU_FANOUT; - } else { - int ccur; - int cprv; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = levelcnt[i]; - levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } - } -} - -/* * Helper function for rcu_init() that initializes one rcu_state structure. */ static void __init rcu_init_one(struct rcu_state *rsp) @@ -4035,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) static const char * const fqs[] = RCU_FQS_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static u8 fl_mask = 0x1; - int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; @@ -4052,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) /* Initialize the level-tracking arrays. */ - for (i = 0; i < rcu_num_lvls; i++) - levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; - rcu_init_levelspread(levelspread, levelcnt); - rsp->flavor_mask = fl_mask; - fl_mask <<= 1; + rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < levelcnt[i]; j++, rnp++) { + for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); @@ -4238,6 +4218,8 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..ba38262c3554 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -24,85 +24,15 @@ #include <linux/cache.h> #include <linux/spinlock.h> +#include <linux/rtmutex.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> #include <linux/swait.h> #include <linux/stop_machine.h> +#include <linux/rcu_node_tree.h> -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. - */ - -#ifdef CONFIG_RCU_FANOUT -#define RCU_FANOUT CONFIG_RCU_FANOUT -#else /* #ifdef CONFIG_RCU_FANOUT */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT 64 -# else -# define RCU_FANOUT 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT */ - -#ifdef CONFIG_RCU_FANOUT_LEAF -#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF -#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT_LEAF 64 -# else -# define RCU_FANOUT_LEAF 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ - -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_NODES NUM_RCU_LVL_0 -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } -# define RCU_NODE_NAME_INIT { "rcu_node_0" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -extern int rcu_num_lvls; -extern int rcu_num_nodes; +#include "rcu_segcblist.h" /* * Dynticks per-CPU state. @@ -112,6 +42,9 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ + unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ @@ -261,41 +194,6 @@ struct rcu_node { #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) /* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. - */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. - */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Iterate over all possible CPUs in a leaf RCU node. - */ -#define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) - -/* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. */ @@ -335,34 +233,9 @@ struct rcu_data { /* period it is aware of. */ /* 2) batch handling */ - /* - * If nxtlist is not NULL, it is partitioned as follows. - * Any of the partitions might be empty, in which case the - * pointer to that partition will be equal to the pointer for - * the following partition. When the list is empty, all of - * the nxttail elements point to the ->nxtlist pointer itself, - * which in that case is NULL. - * - * [nxtlist, *nxttail[RCU_DONE_TAIL]): - * Entries that batch # <= ->completed - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * Note that the value of *nxttail[RCU_NEXT_TAIL] will - * always be NULL, as this is the end of the list. - */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail[RCU_NEXT_SIZE]; - unsigned long nxtcompleted[RCU_NEXT_SIZE]; - /* grace periods for sublists. */ - long qlen_lazy; /* # of lazy queued callbacks */ - long qlen; /* # of queued callbacks, incl lazy */ + struct rcu_segcblist cblist; /* Segmented callback list, with */ + /* different callbacks waiting for */ + /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ @@ -481,7 +354,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ @@ -501,14 +373,11 @@ struct rcu_state { raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; /* Protect following fields. */ - struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ /* need a grace period. */ - struct rcu_head **orphan_nxttail; /* Tail of above. */ - struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_done; /* Orphaned callbacks that */ /* are ready to invoke. */ - struct rcu_head **orphan_donetail; /* Tail of above. */ - long qlen_lazy; /* Number of lazy callbacks. */ - long qlen; /* Total number of callbacks. */ + /* (Contains counts.) */ /* End of fields guarded by orphan_lock. */ struct mutex barrier_mutex; /* Guards barrier fields. */ @@ -521,7 +390,6 @@ struct rcu_state { struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ @@ -595,6 +463,9 @@ extern struct rcu_state rcu_bh_state; extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ +int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); + #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); @@ -671,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +#ifdef CONFIG_SRCU +void srcu_online_cpu(unsigned int cpu); +void srcu_offline_cpu(unsigned int cpu); +#else /* #ifdef CONFIG_SRCU */ +void srcu_online_cpu(unsigned int cpu) { } +void srcu_offline_cpu(unsigned int cpu) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE @@ -688,18 +567,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #ifdef CONFIG_RCU_TRACE */ /* - * Place this after a lock-acquisition primitive to guarantee that - * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies - * if the UNLOCK and LOCK are executed by the same CPU or if the - * UNLOCK and LOCK operate on the same lock variable. - */ -#ifdef CONFIG_PPC -#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ -#define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ - -/* * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e59e1849b89a..e513b4ab1197 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,16 +20,26 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ -/* Wrapper functions for expedited grace periods. */ +/* + * Record the start of an expedited grace period. + */ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) { rcu_seq_start(&rsp->expedited_sequence); } + +/* + * Record the end of an expedited grace period. + */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) { rcu_seq_end(&rsp->expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } + +/* + * Take a snapshot of the expedited-grace-period counter. + */ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { unsigned long s; @@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); return s; } + +/* + * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true + * if a full expedited grace period has elapsed since that snapshot + * was taken. + */ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { return rcu_seq_done(&rsp->expedited_sequence, s); @@ -276,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(rsp, &rdp->exp_workdone2, s)); return true; @@ -315,6 +331,8 @@ static void sync_sched_exp_handler(void *data) return; } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); resched_cpu(smp_processor_id()); } @@ -356,12 +374,11 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, mask_ofl_test = 0; for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); rdp->exp_dynticks_snap = - atomic_add_return(0, &rdtp->dynticks); + rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - !(rdp->exp_dynticks_snap & 0x1) || + rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -380,13 +397,12 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (atomic_add_return(0, &rdtp->dynticks) != - rdp->exp_dynticks_snap) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } @@ -517,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) rnp->exp_seq_rq = s; spin_unlock(&rnp->exp_lock); } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + smp_mb(); /* All above changes before wakeup. */ + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); mutex_unlock(&rsp->exp_wake_mutex); @@ -595,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, /* Wait for expedited grace period to complete. */ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone0, s)); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rsp->exp_mutex); @@ -623,6 +640,11 @@ void synchronize_sched_expedited(void) { struct rcu_state *rsp = &rcu_sched_state; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched_expedited() in RCU read-side critical section"); + /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; @@ -692,6 +714,11 @@ void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); @@ -711,15 +738,3 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -/* - * Switch to run-time mode once Tree RCU has fully initialized. - */ -static int __init rcu_exp_runtime_mode(void) -{ - rcu_test_sync_prims(); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; - rcu_test_sync_prims(); - return 0; -} -core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56583e764ebf..c9a48657512a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -27,7 +27,9 @@ #include <linux/delay.h> #include <linux/gfp.h> #include <linux/oom.h> +#include <linux/sched/debug.h> #include <linux/smpboot.h> +#include <uapi/linux/sched/types.h> #include "../time/tick-internal.h" #ifdef CONFIG_RCU_BOOST @@ -1348,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) */ if ((rdp->completed != rnp->completed || unlikely(READ_ONCE(rdp->gpwrap))) && - rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) cbs_ready = true; } return cbs_ready; @@ -1459,7 +1461,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!*rdp->nxttail[RCU_DONE_TAIL]) + if (rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ @@ -1527,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused) for_each_rcu_flavor(rsp) { rdp = raw_cpu_ptr(rsp->rda); - if (rdp->qlen_lazy != 0) { + if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { atomic_inc(&oom_callback_count); rsp->call(&rdp->oom_head, rcu_oom_callback); } @@ -1643,7 +1645,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], ticks_value, ticks_title, - atomic_read(&rdtp->dynticks) & 0xfff, + rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, @@ -1707,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { - rcu_nocb_poll = 1; + rcu_nocb_poll = true; return 0; } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); @@ -1858,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } @@ -1870,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvfIsDeferred")); } @@ -1928,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->qlen; - long qll = rsp->qlen_lazy; + long ql = rsp->orphan_done.len; + long qll = rsp->orphan_done.len_lazy; /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ if (!rcu_is_nocb_cpu(smp_processor_id())) return false; - rsp->qlen = 0; - rsp->qlen_lazy = 0; /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_donelist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; + if (rsp->orphan_done.head) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), + rcu_cblist_tail(&rsp->orphan_done), + ql, qll, flags); } - if (rsp->orphan_nxtlist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; + if (rsp->orphan_pend.head) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), + rcu_cblist_tail(&rsp->orphan_pend), + ql, qll, flags); } + rcu_cblist_init(&rsp->orphan_done); + rcu_cblist_init(&rsp->orphan_pend); return true; } @@ -2366,8 +2368,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) } /* - * Each pass through this loop sets up one rcu_data structure and - * spawns one rcu_nocb_kthread(). + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); @@ -2392,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return false; /* If there are early-boot callbacks, move them to nocb lists. */ - if (rdp->nxtlist) { - rdp->nocb_head = rdp->nxtlist; - rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; - atomic_long_set(&rdp->nocb_q_count, rdp->qlen); - atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); - rdp->nxtlist = NULL; - rdp->qlen = 0; - rdp->qlen_lazy = 0; + if (!rcu_segcblist_empty(&rdp->cblist)) { + rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); + rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); + atomic_long_set(&rdp->nocb_q_count, + rcu_segcblist_n_cbs(&rdp->cblist)); + atomic_long_set(&rdp->nocb_q_count_lazy, + rcu_segcblist_n_lazy_cbs(&rdp->cblist)); + rcu_segcblist_init(&rdp->cblist); } - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + rcu_segcblist_disable(&rdp->cblist); return true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b1f28972872c..6cea17a1ea30 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -41,11 +41,11 @@ #include <linux/mutex.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/prefetch.h> #define RCU_TREE_NONCORE #include "tree.h" - -DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +#include "rcu.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) @@ -121,26 +121,24 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), + rcu_dynticks_snap(rdp->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, " of=%lu", rdp->offline_fqs); rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rdp->qlen_lazy; - ql += rdp->qlen; + qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); + ql += rcu_segcblist_n_cbs(&rdp->cblist); seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", qll, ql, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); + ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], + ".R"[!rcu_segcblist_segempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)], + ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], + ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); #ifdef CONFIG_RCU_BOOST seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), @@ -194,9 +192,8 @@ static int show_rcuexp(struct seq_file *m, void *v) s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, s0, s1, s2, s3, - atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); return 0; @@ -279,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); + READ_ONCE(rsp->n_force_qs_lh), + rsp->orphan_done.len_lazy, + rsp->orphan_done.len); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4f6db7e6a117..273e869ca21d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -36,7 +36,8 @@ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/interrupt.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/percpu.h> @@ -49,6 +50,7 @@ #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/tick.h> +#include <linux/rcupdate_wait.h> #define CREATE_TRACE_POINTS @@ -122,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then * rcu_normal wins. (Except during the time period during boot from - * when the first task is spawned until the rcu_exp_runtime_mode() + * when the first task is spawned until the rcu_set_runtime_mode() * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) @@ -132,8 +134,7 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = - ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for @@ -182,14 +183,46 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); */ void rcu_end_inkernel_boot(void) { - if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); } #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) + +/* + * Switch to run-time mode once RCU has fully initialized. + */ +static int __init rcu_set_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_set_runtime_mode); + +#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ + #ifdef CONFIG_PREEMPT_RCU /* @@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t, put_task_struct(t); return; } + rcu_request_urgent_qs_task(t); if (!needreport) return; if (*firstreport) { @@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ -/* - * Test each non-SRCU synchronous grace-period wait API. This is - * useful just after a change in mode for these primitives, and - * during early boot. - */ -void rcu_test_sync_prims(void) -{ - if (!IS_ENABLED(CONFIG_PROVE_RCU)) - return; - synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); - synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); -} - #ifdef CONFIG_PROVE_RCU /* diff --git a/kernel/relay.c b/kernel/relay.c index 8f18d314a96a..39a9dfc69486 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) /* * fault() vm_op implementation for relay file mapping. */ -static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int relay_buf_fault(struct vm_fault *vmf) { struct page *page; - struct rchan_buf *buf = vma->vm_private_data; + struct rchan_buf *buf = vmf->vma->vm_private_data; pgoff_t pgoff = vmf->pgoff; if (!buf) @@ -847,7 +847,7 @@ void relay_close(struct rchan *chan) if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%Zd) > sub-buffer size (%Zd)]\n", + "[item size (%zd) > sub-buffer size (%zd)]\n", chan->last_toobig, chan->subbuf_size); list_del(&chan->list); @@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in, .nr_pages = 0, .nr_pages_max = PIPE_DEF_BUFFERS, .partial = partial, - .flags = flags, .ops = &relay_pipe_buf_ops, .spd_release = relay_page_release, }; diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5e59b832ae2b..89ab6758667b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,8 +18,8 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o -obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o +obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/autogroup.c index da39489d2d80..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/autogroup.c diff --git a/kernel/sched/auto_group.h b/kernel/sched/autogroup.h index 890c95f2587a..ce40c810cd5c 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,7 @@ #include <linux/kref.h> #include <linux/rwsem.h> +#include <linux/sched/autogroup.h> struct autogroup { /* diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e85a725e5c34..00a45c45beca 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -58,6 +58,8 @@ #include <linux/percpu.h> #include <linux/ktime.h> #include <linux/sched.h> +#include <linux/nmi.h> +#include <linux/sched/clock.h> #include <linux/static_key.h> #include <linux/workqueue.h> #include <linux/compiler.h> @@ -77,91 +79,113 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static struct static_key __sched_clock_stable = STATIC_KEY_INIT; -static int __sched_clock_stable_early; +/* + * We must start with !__sched_clock_stable because the unstable -> stable + * transition is accurate, while the stable -> unstable transition is not. + * + * Similarly we start with __sched_clock_stable_early, thereby assuming we + * will become stable, such that there's only a single 1 -> 0 transition. + */ +static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); +static int __sched_clock_stable_early = 1; -int sched_clock_stable(void) +/* + * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset + */ +__read_mostly u64 __sched_clock_offset; +static __read_mostly u64 __gtod_offset; + +struct sched_clock_data { + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) { - return static_key_false(&__sched_clock_stable); + return this_cpu_ptr(&sched_clock_data); } -static void __set_sched_clock_stable(void) +static inline struct sched_clock_data *cpu_sdc(int cpu) { - if (!sched_clock_stable()) - static_key_slow_inc(&__sched_clock_stable); + return &per_cpu(sched_clock_data, cpu); +} - tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); +int sched_clock_stable(void) +{ + return static_branch_likely(&__sched_clock_stable); } -void set_sched_clock_stable(void) +static void __set_sched_clock_stable(void) { - __sched_clock_stable_early = 1; + struct sched_clock_data *scd = this_scd(); - smp_mb(); /* matches sched_clock_init() */ + /* + * Attempt to make the (initial) unstable->stable transition continuous. + */ + __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); - if (!sched_clock_running) - return; + printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); - __set_sched_clock_stable(); + static_branch_enable(&__sched_clock_stable); + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } -static void __clear_sched_clock_stable(struct work_struct *work) +static void __sched_clock_work(struct work_struct *work) { - /* XXX worry about clock continuity */ - if (sched_clock_stable()) - static_key_slow_dec(&__sched_clock_stable); - - tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); + static_branch_disable(&__sched_clock_stable); } -static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); +static DECLARE_WORK(sched_clock_work, __sched_clock_work); -void clear_sched_clock_stable(void) +static void __clear_sched_clock_stable(void) { - __sched_clock_stable_early = 0; - - smp_mb(); /* matches sched_clock_init() */ - - if (!sched_clock_running) - return; - - schedule_work(&sched_clock_work); -} + struct sched_clock_data *scd = this_scd(); -struct sched_clock_data { - u64 tick_raw; - u64 tick_gtod; - u64 clock; -}; + /* + * Attempt to make the stable->unstable transition continuous. + * + * Trouble is, this is typically called from the TSC watchdog + * timer, which is late per definition. This means the tick + * values can already be screwy. + * + * Still do what we can. + */ + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); -static inline struct sched_clock_data *this_scd(void) -{ - return this_cpu_ptr(&sched_clock_data); -} + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); -static inline struct sched_clock_data *cpu_sdc(int cpu) -{ - return &per_cpu(sched_clock_data, cpu); + if (sched_clock_stable()) + schedule_work(&sched_clock_work); } -void sched_clock_init(void) +void clear_sched_clock_stable(void) { - u64 ktime_now = ktime_to_ns(ktime_get()); - int cpu; - - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); + __sched_clock_stable_early = 0; - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } + smp_mb(); /* matches sched_clock_init_late() */ - sched_clock_running = 1; + if (sched_clock_running == 2) + __clear_sched_clock_stable(); +} +void sched_clock_init_late(void) +{ + sched_clock_running = 2; /* * Ensure that it is impossible to not do a static_key update. * @@ -173,8 +197,6 @@ void sched_clock_init(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); - else - __clear_sched_clock_stable(NULL); } /* @@ -199,7 +221,7 @@ static inline u64 wrap_max(u64 x, u64 y) */ static u64 sched_clock_local(struct sched_clock_data *scd) { - u64 now, clock, old_clock, min_clock, max_clock; + u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; again: @@ -216,9 +238,10 @@ again: * scd->tick_gtod + TICK_NSEC); */ - clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, old_clock); - max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); + gtod = scd->tick_gtod + __gtod_offset; + clock = gtod + delta; + min_clock = wrap_max(gtod, old_clock); + max_clock = wrap_max(old_clock, gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); @@ -302,7 +325,7 @@ u64 sched_clock_cpu(int cpu) u64 clock; if (sched_clock_stable()) - return sched_clock(); + return sched_clock() + __sched_clock_offset; if (unlikely(!sched_clock_running)) return 0ull; @@ -323,23 +346,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { struct sched_clock_data *scd; - u64 now, now_gtod; - - if (sched_clock_stable()) - return; - - if (unlikely(!sched_clock_running)) - return; WARN_ON_ONCE(!irqs_disabled()); + /* + * Update these values even if sched_clock_stable(), because it can + * become unstable at any point in time at which point we need some + * values to fall back on. + * + * XXX arguably we can skip this if we expose tsc_clocksource_reliable + */ scd = this_scd(); - now_gtod = ktime_to_ns(ktime_get()); - now = sched_clock(); + scd->tick_raw = sched_clock(); + scd->tick_gtod = ktime_get_ns(); - scd->tick_raw = now; - scd->tick_gtod = now_gtod; - sched_clock_local(scd); + if (!sched_clock_stable() && likely(sched_clock_running)) + sched_clock_local(scd); } /* @@ -366,11 +388,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - u64 sched_clock_cpu(int cpu) { if (unlikely(!sched_clock_running)) @@ -378,6 +395,7 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 8d0f35debf35..53f9558fa925 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -11,7 +11,8 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> #include <linux/completion.h> /** @@ -31,7 +32,8 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done++; + if (x->done != UINT_MAX) + x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -51,7 +53,7 @@ void complete_all(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; + x->done = UINT_MAX; __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -79,7 +81,8 @@ do_wait_for_common(struct completion *x, if (!x->done) return timeout; } - x->done--; + if (x->done != UINT_MAX) + x->done--; return timeout ?: 1; } @@ -280,7 +283,7 @@ bool try_wait_for_completion(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - else + else if (x->done != UINT_MAX) x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); return ret; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c56fb57f2991..759f4bd52cd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,85 +1,33 @@ /* * kernel/sched/core.c * - * Kernel scheduler and related syscalls + * Core kernel scheduler code and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz */ - -#include <linux/kasan.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/uaccess.h> -#include <linux/highmem.h> -#include <linux/mmu_context.h> -#include <linux/interrupt.h> -#include <linux/capability.h> -#include <linux/completion.h> -#include <linux/kernel_stat.h> -#include <linux/debug_locks.h> -#include <linux/perf_event.h> -#include <linux/security.h> -#include <linux/notifier.h> -#include <linux/profile.h> -#include <linux/freezer.h> -#include <linux/vmalloc.h> -#include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/pid_namespace.h> -#include <linux/smp.h> -#include <linux/threads.h> -#include <linux/timer.h> -#include <linux/rcupdate.h> -#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <uapi/linux/sched/types.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/hotplug.h> #include <linux/cpuset.h> -#include <linux/percpu.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/sysctl.h> -#include <linux/syscalls.h> -#include <linux/times.h> -#include <linux/tsacct_kern.h> -#include <linux/kprobes.h> #include <linux/delayacct.h> -#include <linux/unistd.h> -#include <linux/pagemap.h> -#include <linux/hrtimer.h> -#include <linux/tick.h> -#include <linux/ctype.h> -#include <linux/ftrace.h> -#include <linux/slab.h> #include <linux/init_task.h> #include <linux/context_tracking.h> -#include <linux/compiler.h> -#include <linux/frame.h> +#include <linux/rcupdate_wait.h> + +#include <linux/blkdev.h> +#include <linux/kprobes.h> +#include <linux/mmu_context.h> +#include <linux/module.h> +#include <linux/nmi.h> #include <linux/prefetch.h> -#include <linux/mutex.h> +#include <linux/profile.h> +#include <linux/security.h> +#include <linux/syscalls.h> #include <asm/switch_to.h> #include <asm/tlb.h> -#include <asm/irq_regs.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif @@ -91,27 +39,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static void update_rq_clock_task(struct rq *rq, s64 delta); - -void update_rq_clock(struct rq *rq) -{ - s64 delta; - - lockdep_assert_held(&rq->lock); - - if (rq->clock_skip_update & RQCF_ACT_SKIP) - return; - - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - if (delta < 0) - return; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - /* * Debugging: various feature bits */ @@ -140,7 +69,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; /* - * period over which we measure -rt task cpu usage in us. + * period over which we measure -rt task CPU usage in us. * default: 1s */ unsigned int sysctl_sched_rt_period = 1000000; @@ -153,25 +82,10 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* cpus with isolated domains */ +/* CPUs with isolated domains */ cpumask_var_t cpu_isolated_map; /* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - raw_spin_lock(&rq->lock); - - return rq; -} - -/* * __task_rq_lock - lock the rq @p resides on. */ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -185,7 +99,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -221,11 +135,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * If we observe the old cpu in task_rq_lock, the acquire of * the old rq->lock will fully serialize against the stores. * - * If we observe the new cpu in task_rq_lock, the acquire will + * If we observe the new CPU in task_rq_lock, the acquire will * pair with the WMB to ensure we must then also see migrating. */ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -236,6 +150,87 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) } } +/* + * RQ-clock updating methods: + */ + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_key_false((¶virt_steal_rq_enabled))) { + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + rq->prev_steal_time_rq += steal; + delta -= steal; + } +#endif + + rq->clock_task += delta; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif +} + +void update_rq_clock(struct rq *rq) +{ + s64 delta; + + lockdep_assert_held(&rq->lock); + + if (rq->clock_update_flags & RQCF_ACT_SKIP) + return; + +#ifdef CONFIG_SCHED_DEBUG + if (sched_feat(WARN_DOUBLE_CLOCK)) + SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); + rq->clock_update_flags |= RQCF_UPDATED; +#endif + + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + if (delta < 0) + return; + rq->clock += delta; + update_rq_clock_task(rq, delta); +} + + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -254,13 +249,14 @@ static void hrtick_clear(struct rq *rq) static enum hrtimer_restart hrtick(struct hrtimer *timer) { struct rq *rq = container_of(timer, struct rq, hrtick_timer); + struct rq_flags rf; WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); return HRTIMER_NORESTART; } @@ -280,11 +276,12 @@ static void __hrtick_restart(struct rq *rq) static void __hrtick_start(void *arg) { struct rq *rq = arg; + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); __hrtick_restart(rq); rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -458,7 +455,7 @@ void wake_up_q(struct wake_q_head *head) task = container_of(node, struct task_struct, wake_q); BUG_ON(!task); - /* task can safely be re-inserted now */ + /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; @@ -516,12 +513,12 @@ void resched_cpu(int cpu) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* - * In the semi idle case, use the nearest busy cpu for migrating timers - * from an idle cpu. This is good for power-savings. + * In the semi idle case, use the nearest busy CPU for migrating timers + * from an idle CPU. This is good for power-savings. * * We don't do similar optimization for completely idle system, as - * selecting an idle cpu will add more delays to the timers than intended - * (as that cpu's timer base may not be uptodate wrt jiffies etc). + * selecting an idle CPU will add more delays to the timers than intended + * (as that CPU's timer base may not be uptodate wrt jiffies etc). */ int get_nohz_timer_target(void) { @@ -550,6 +547,7 @@ unlock: rcu_read_unlock(); return cpu; } + /* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event @@ -754,17 +752,23 @@ static void set_load_weight(struct task_struct *p) static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & ENQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); + p->sched_class->enqueue_task(rq, p, flags); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & DEQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); + p->sched_class->dequeue_task(rq, p, flags); } @@ -784,60 +788,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ -/* - * In theory, the compile should just see 0 here, and optimize out the call - * to sched_rt_avg_update. But I don't trust it... - */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; - - rq->prev_irq_time += irq_delta; - delta -= irq_delta; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_key_false((¶virt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); - steal -= rq->prev_steal_time_rq; - - if (unlikely(steal > delta)) - steal = delta; - - rq->prev_steal_time_rq += steal; - delta -= steal; - } -#endif - - rq->clock_task += delta; - -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) - sched_rt_avg_update(rq, irq_delta + steal); -#endif -} - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -992,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * * Returns (locked) new rq. Old rq's lock is released. */ -static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int new_cpu) { lockdep_assert_held(&rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = cpu_rq(new_cpu); - raw_spin_lock(&rq->lock); + rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1018,7 +969,7 @@ struct migration_arg { }; /* - * Move (not current) task off this cpu, onto dest cpu. We're doing + * Move (not current) task off this CPU, onto the destination CPU. We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're * attempting to rebalance this task on exec (sched_exec). @@ -1026,16 +977,18 @@ struct migration_arg { * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. */ -static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int dest_cpu) { if (unlikely(!cpu_active(dest_cpu))) return rq; /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return rq; - rq = move_queued_task(rq, p, dest_cpu); + update_rq_clock(rq); + rq = move_queued_task(rq, rf, p, dest_cpu); return rq; } @@ -1050,10 +1003,11 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + struct rq_flags rf; /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. + * The original target CPU might have gone down and we might + * be on another CPU but it doesn't matter. */ local_irq_disable(); /* @@ -1064,7 +1018,7 @@ static int migration_cpu_stop(void *data) sched_ttwu_pending(); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1072,11 +1026,11 @@ static int migration_cpu_stop(void *data) */ if (task_rq(p) == rq) { if (task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else p->wake_cpu = arg->dest_cpu; } - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); raw_spin_unlock(&p->pi_lock); local_irq_enable(); @@ -1109,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. */ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) put_prev_task(rq, p); @@ -1117,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); } @@ -1141,6 +1095,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, int ret = 0; rq = task_rq_lock(p, &rf); + update_rq_clock(rq); if (p->flags & PF_KTHREAD) { /* @@ -1171,7 +1126,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (p->flags & PF_KTHREAD) { /* * For kernel threads that do indeed end up on online && - * !active we want to ensure they are strict per-cpu threads. + * !active we want to ensure they are strict per-CPU threads. */ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && !cpumask_intersects(new_mask, cpu_active_mask) && @@ -1195,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); - rq = move_queued_task(rq, p, dest_cpu); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq = move_queued_task(rq, &rf, p, dest_cpu); } out: task_rq_unlock(rq, p, &rf); @@ -1262,21 +1215,29 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; + struct rq_flags srf, drf; src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + rq_pin_lock(src_rq, &srf); + rq_pin_lock(dst_rq, &drf); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); + + rq_unpin_lock(dst_rq, &drf); + rq_unpin_lock(src_rq, &srf); + } else { /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our target instead of where it really is. + * previous CPU our target instead of where it really is. */ p->wake_cpu = cpu; } @@ -1309,10 +1270,10 @@ static int migrate_swap_stop(void *data) if (task_cpu(arg->src_task) != arg->src_cpu) goto unlock; - if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) goto unlock; - if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) goto unlock; __migrate_swap_task(arg->src_task, arg->dst_cpu); @@ -1353,10 +1314,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; - if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) goto out; - if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) goto out; trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); @@ -1508,12 +1469,12 @@ EXPORT_SYMBOL_GPL(kick_process); * * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, * see __set_cpus_allowed_ptr(). At this point the newly online - * cpu isn't yet part of the sched domains, and balancing will not + * CPU isn't yet part of the sched domains, and balancing will not * see it. * - * - on cpu-down we clear cpu_active() to mask the sched domains and + * - on CPU-down we clear cpu_active() to mask the sched domains and * avoid the load balancer to place new tasks on the to be removed - * cpu. Existing tasks will remain running there and will be taken + * CPU. Existing tasks will remain running there and will be taken * off. * * This means that fallback selection must not select !active CPUs. @@ -1529,9 +1490,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) int dest_cpu; /* - * If the node that the cpu is on has been offlined, cpu_to_node() - * will return -1. There is no cpu on the node, and we should - * select the cpu on the other node. + * If the node that the CPU is on has been offlined, cpu_to_node() + * will return -1. There is no CPU on the node, and we should + * select the CPU on the other node. */ if (nid != -1) { nodemask = cpumask_of_node(nid); @@ -1540,14 +1501,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return dest_cpu; } } for (;;) { /* Any allowed, online CPU? */ - for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { + for_each_cpu(dest_cpu, &p->cpus_allowed) { if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) continue; if (!cpu_online(dest_cpu)) @@ -1563,7 +1524,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) state = possible; break; } - /* fall-through */ + /* Fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; @@ -1599,22 +1560,22 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else - cpu = cpumask_any(tsk_cpus_allowed(p)); + cpu = cpumask_any(&p->cpus_allowed); /* * In order not to call set_task_cpu() on a blocking task we need * to rely on ttwu() to place the task on a valid ->cpus_allowed - * cpu. + * CPU. * * Since this is common to all placement strategies, this lives here. * * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); @@ -1681,7 +1642,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; - /* if a worker is waking up, notify workqueue */ + /* If a worker is waking up, notify the workqueue: */ if (p->flags & PF_WQ_WORKER) wq_worker_waking_up(p, cpu_of(rq)); } @@ -1690,7 +1651,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl * Mark the task runnable and perform wakeup-preemption. */ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1702,9 +1663,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (rq->idle_stamp) { @@ -1723,9 +1684,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { - int en_flags = ENQUEUE_WAKEUP; + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; lockdep_assert_held(&rq->lock); @@ -1738,7 +1699,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #endif ttwu_activate(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, cookie); + ttwu_do_wakeup(rq, p, wake_flags, rf); } /* @@ -1757,7 +1718,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); + ttwu_do_wakeup(rq, p, wake_flags, &rf); ret = 1; } __task_rq_unlock(rq, &rf); @@ -1770,15 +1731,14 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct pin_cookie cookie; struct task_struct *p; - unsigned long flags; + struct rq_flags rf; if (!llist) return; - raw_spin_lock_irqsave(&rq->lock, flags); - cookie = lockdep_pin_lock(&rq->lock); + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); while (llist) { int wake_flags = 0; @@ -1789,11 +1749,10 @@ void sched_ttwu_pending(void) if (p->sched_remote_wakeup) wake_flags = WF_MIGRATED; - ttwu_do_activate(rq, p, wake_flags, cookie); + ttwu_do_activate(rq, p, wake_flags, &rf); } - lockdep_unpin_lock(&rq->lock, cookie); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } void scheduler_ipi(void) @@ -1851,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; rcu_read_lock(); @@ -1861,11 +1820,11 @@ void wake_up_if_idle(int cpu) if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); } else { - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (is_idle_task(rq->curr)) smp_send_reschedule(cpu); - /* Else cpu is not in idle, do nothing here */ - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* Else CPU is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); } out: @@ -1881,21 +1840,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); - struct pin_cookie cookie; + struct rq_flags rf; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* sync clocks x-cpu */ + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ttwu_queue_remote(p, cpu, wake_flags); return; } #endif - raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, wake_flags, cookie); - lockdep_unpin_lock(&rq->lock, cookie); - raw_spin_unlock(&rq->lock); + rq_lock(rq, &rf); + update_rq_clock(rq); + ttwu_do_activate(rq, p, wake_flags, &rf); + rq_unlock(rq, &rf); } /* @@ -1904,8 +1862,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * MIGRATION * * The basic program-order guarantee on SMP systems is that when a task [t] - * migrates, all its activity on its old cpu [c0] happens-before any subsequent - * execution on its new cpu [c1]. + * migrates, all its activity on its old CPU [c0] happens-before any subsequent + * execution on its new CPU [c1]. * * For migration (of runnable tasks) this is provided by the following means: * @@ -1916,7 +1874,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * * Transitivity guarantees that B happens after A and C after B. * Note: we only require RCpc transitivity. - * Note: the cpu doing B need not be c0 or c1 + * Note: the CPU doing B need not be c0 or c1 * * Example: * @@ -2024,7 +1982,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) trace_sched_waking(p); - success = 1; /* we're going to change ->state */ + /* We're going to change ->state: */ + success = 1; cpu = task_cpu(p); /* @@ -2073,7 +2032,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) smp_rmb(); /* - * If the owning (remote) cpu is still in the middle of schedule() with + * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * * Pairs with the smp_store_release() in finish_lock_switch(). @@ -2086,11 +2045,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + +#else /* CONFIG_SMP */ + + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2111,7 +2083,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) +static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) { struct rq *rq = task_rq(p); @@ -2128,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock, cookie); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - lockdep_repin_lock(&rq->lock, cookie); + rq_relock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2140,10 +2110,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie trace_sched_waking(p); - if (!task_on_rq_queued(p)) - ttwu_activate(rq, p, ENQUEUE_WAKEUP); + if (!task_on_rq_queued(p)) { + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&rq->nr_iowait); + } + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); + } - ttwu_do_wakeup(rq, p, 0, cookie); + ttwu_do_wakeup(rq, p, 0, rf); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); @@ -2427,7 +2402,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * We're setting the cpu for the first time, we don't migrate, + * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). */ __set_task_cpu(p, cpu); @@ -2570,7 +2545,7 @@ void wake_up_new_task(struct task_struct *p) /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path - * - any previously selected cpu might disappear through hotplug + * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. @@ -2578,9 +2553,10 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -2590,9 +2566,9 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif task_rq_unlock(rq, p, &rf); @@ -2861,7 +2837,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next, struct pin_cookie cookie) + struct task_struct *next, struct rq_flags *rf) { struct mm_struct *mm, *oldmm; @@ -2878,7 +2854,7 @@ context_switch(struct rq *rq, struct task_struct *prev, if (!mm) { next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); + mmgrab(oldmm); enter_lazy_tlb(oldmm, next); } else switch_mm_irqs_off(oldmm, mm, next); @@ -2887,13 +2863,16 @@ context_switch(struct rq *rq, struct task_struct *prev, prev->active_mm = NULL; rq->prev_mm = oldmm; } + + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + /* * Since the runqueue lock will be released by the next * task (which is an invalid locking op but in the case * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2920,7 +2899,7 @@ unsigned long nr_running(void) } /* - * Check if only the current task is running on the cpu. + * Check if only the current task is running on the CPU. * * Caution: this function does not check that the caller has disabled * preemption, thus the result might have a time-of-check-to-time-of-use @@ -2949,6 +2928,36 @@ unsigned long long nr_context_switches(void) return sum; } +/* + * IO-wait accounting, and how its mostly bollocks (on SMP). + * + * The idea behind IO-wait account is to account the idle time that we could + * have spend running if it were not for IO. That is, if we were to improve the + * storage performance, we'd have a proportional reduction in IO-wait time. + * + * This all works nicely on UP, where, when a task blocks on IO, we account + * idle time as IO-wait, because if the storage were faster, it could've been + * running and we'd not be idle. + * + * This has been extended to SMP, by doing the same for each CPU. This however + * is broken. + * + * Imagine for instance the case where two tasks block on one CPU, only the one + * CPU will have IO-wait accounted, while the other has regular idle. Even + * though, if the storage were faster, both could've ran at the same time, + * utilising both CPUs. + * + * This means, that when looking globally, the current IO-wait accounting on + * SMP is a lower bound, by reason of under accounting. + * + * Worse, since the numbers are provided per CPU, they are sometimes + * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly + * associated with any one particular CPU, it can wake to another CPU than it + * blocked on. This means the per CPU IO-wait number is meaningless. + * + * Task CPU affinities can make all that even more 'interesting'. + */ + unsigned long nr_iowait(void) { unsigned long i, sum = 0; @@ -2959,6 +2968,13 @@ unsigned long nr_iowait(void) return sum; } +/* + * Consumers of these two interfaces, like for example the cpufreq menu + * governor are using nonsensical data. Boosting frequency for a CPU that has + * IO-wait which might not even end up running the task when it does become + * runnable. + */ + unsigned long nr_iowait_cpu(int cpu) { struct rq *this = cpu_rq(cpu); @@ -3042,8 +3058,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) * So we have a optimization chance when the task's delta_exec is 0. * Reading ->on_cpu is racy, but this is ok. * - * If we race with it leaving cpu, we'll take a lock. So we're correct. - * If we race with it entering cpu, unaccounted time is 0. This is + * If we race with it leaving CPU, we'll take a lock. So we're correct. + * If we race with it entering CPU, unaccounted time is 0. This is * indistinguishable from the read occurring a few cycles earlier. * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. @@ -3078,15 +3094,18 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + struct rq_flags rf; sched_clock_tick(); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); + update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); - raw_spin_unlock(&rq->lock); + + rq_unlock(rq, &rf); perf_event_task_tick(); @@ -3201,6 +3220,15 @@ static inline void preempt_latency_start(int val) { } static inline void preempt_latency_stop(int val) { } #endif +static inline unsigned long get_preempt_disable_ip(struct task_struct *p) +{ +#ifdef CONFIG_DEBUG_PREEMPT + return p->preempt_disable_ip; +#else + return 0; +#endif +} + /* * Print scheduling while atomic bug: */ @@ -3257,31 +3285,35 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - const struct sched_class *class = &fair_sched_class; + const struct sched_class *class; struct task_struct *p; /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: + * Optimization: we know that if all tasks are in the fair class we can + * call that function directly, but only if the @prev task wasn't of a + * higher scheduling class, because otherwise those loose the + * opportunity to pull in more work from other CPUs. */ - if (likely(prev->sched_class == class && + if (likely((prev->sched_class == &idle_sched_class || + prev->sched_class == &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, cookie); + + p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; - /* assumes fair_sched_class->next == idle_sched_class */ + /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, cookie); + p = idle_sched_class.pick_next_task(rq, prev, rf); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev, cookie); + p = class->pick_next_task(rq, prev, rf); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3289,7 +3321,8 @@ again: } } - BUG(); /* the idle class will always have a runnable task */ + /* The idle class should always have a runnable task: */ + BUG(); } /* @@ -3335,7 +3368,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; - struct pin_cookie cookie; + struct rq_flags rf; struct rq *rq; int cpu; @@ -3349,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below @@ -3357,19 +3390,25 @@ static void __sched notrace __schedule(bool preempt) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); + rq_lock(rq, &rf); - rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + /* Promote REQ to ACT */ + rq->clock_update_flags <<= 1; + update_rq_clock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, DEQUEUE_SLEEP); + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); prev->on_rq = 0; + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + /* * If a worker went to sleep, notify and ask workqueue * whether it wants to wake up a task to maintain @@ -3380,19 +3419,15 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup, cookie); + try_to_wake_up_local(to_wakeup, &rf); } } switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev)) - update_rq_clock(rq); - - next = pick_next_task(rq, prev, cookie); + next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; @@ -3400,10 +3435,12 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next, &rf); } else { - lockdep_unpin_lock(&rq->lock, cookie); - raw_spin_unlock_irq(&rq->lock); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + rq_unlock_irq(rq, &rf); } balance_callback(rq); @@ -3426,14 +3463,18 @@ void __noreturn do_task_dead(void) smp_mb(); raw_spin_unlock_wait(¤t->pi_lock); - /* causes final put_task_struct in finish_task_switch(). */ + /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); - current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + + /* Tell freezer to ignore us: */ + current->flags |= PF_NOFREEZE; + __schedule(false); BUG(); - /* Avoid "noreturn function does return". */ + + /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ for (;;) - cpu_relax(); /* For when BUG is null */ + cpu_relax(); } static inline void sched_submit_work(struct task_struct *tsk) @@ -3630,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function); #ifdef CONFIG_RT_MUTEXES +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + /* * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) + * @p: task to boost + * @pi_task: donor task * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). @@ -3641,16 +3697,42 @@ EXPORT_SYMBOL(default_wake_function); * Used by the rt_mutex code to implement priority inheritance * logic. Call site only calls if the priority of the task changed. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; + int prio, oldprio, queued, running, queue_flag = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; - BUG_ON(prio > MAX_PRIO); + /* XXX used to be waiter->prio, not waiter->task->prio */ + prio = __rt_effective_prio(pi_task, p->normal_prio); + + /* + * If nothing changed; bail early. + */ + if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) + return; rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); + /* + * Set under pi_lock && rq->lock, such that the value can be used under + * either lock. + * + * Note that there is loads of tricky to make this pointer cache work + * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to + * ensure a task is de-boosted (pi_task is set to NULL) before the + * task is allowed to run again (and can exit). This ensures the pointer + * points to a blocked task -- which guaratees the task is present. + */ + p->pi_top_task = pi_task; + + /* + * For FIFO/RR we only need to set prio, if that matches we're done. + */ + if (prio == p->prio && !dl_prio(prio)) + goto out_unlock; /* * Idle task boosting is a nono in general. There is one @@ -3670,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) goto out_unlock; } - trace_sched_pi_setprio(p, prio); + trace_sched_pi_setprio(p, pi_task); oldprio = p->prio; if (oldprio == prio) @@ -3694,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; @@ -3725,12 +3806,18 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: - preempt_disable(); /* avoid rq from going away on us */ + /* Avoid rq from going away on us: */ + preempt_disable(); __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); } +#else +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} #endif void set_user_nice(struct task_struct *p, long nice) @@ -3747,6 +3834,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3760,7 +3849,7 @@ void set_user_nice(struct task_struct *p, long nice) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) put_prev_task(rq, p); @@ -3771,7 +3860,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3793,7 +3882,7 @@ EXPORT_SYMBOL(set_user_nice); */ int can_nice(const struct task_struct *p, const int nice) { - /* convert nice value [19,-20] to rlimit style value [1,40] */ + /* Convert nice value [19,-20] to rlimit style value [1,40]: */ int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || @@ -3849,7 +3938,7 @@ int task_prio(const struct task_struct *p) } /** - * idle_cpu - is a given cpu idle currently? + * idle_cpu - is a given CPU idle currently? * @cpu: the processor in question. * * Return: 1 if the CPU is currently idle. 0 otherwise. @@ -3873,10 +3962,10 @@ int idle_cpu(int cpu) } /** - * idle_task - return the idle task for a given cpu. + * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * - * Return: The idle task for the cpu @cpu. + * Return: The idle task for the CPU @cpu. */ struct task_struct *idle_task(int cpu) { @@ -3975,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * Keep a potential priority boosting if called from * sched_setscheduler(). */ + p->prio = normal_prio(p); if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); - else - p->prio = normal_prio(p); + p->prio = rt_effective_prio(p, p->prio); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -4042,7 +4130,7 @@ __checkparam_dl(const struct sched_attr *attr) } /* - * check the target process has a UID that matches the current process's + * Check the target process has a UID that matches the current process's: */ static bool check_same_owner(struct task_struct *p) { @@ -4057,8 +4145,7 @@ static bool check_same_owner(struct task_struct *p) return match; } -static bool dl_param_changed(struct task_struct *p, - const struct sched_attr *attr) +static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; @@ -4082,13 +4169,13 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq_flags rf; int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; - /* may grab non-irq protected spin_locks */ + /* May grab non-irq protected spin_locks: */ BUG_ON(in_interrupt()); recheck: - /* double check policy once rq lock held */ + /* Double check policy once rq lock held: */ if (policy < 0) { reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; @@ -4128,11 +4215,11 @@ recheck: unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - /* can't set/change the rt policy */ + /* Can't set/change the rt policy: */ if (policy != p->policy && !rlim_rtprio) return -EPERM; - /* can't increase priority */ + /* Can't increase priority: */ if (attr->sched_priority > p->rt_priority && attr->sched_priority > rlim_rtprio) return -EPERM; @@ -4156,11 +4243,11 @@ recheck: return -EPERM; } - /* can't change other user's priorities */ + /* Can't change other user's priorities: */ if (!check_same_owner(p)) return -EPERM; - /* Normal users shall not reset the sched_reset_on_fork flag */ + /* Normal users shall not reset the sched_reset_on_fork flag: */ if (p->sched_reset_on_fork && !reset_on_fork) return -EPERM; } @@ -4172,16 +4259,17 @@ recheck: } /* - * make sure no PI-waiters arrive (or leave) while we are + * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: * * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); /* - * Changing the policy of the stop threads its a very bad idea + * Changing the policy of the stop threads its a very bad idea: */ if (p == rq->stop) { task_rq_unlock(rq, p, &rf); @@ -4237,7 +4325,7 @@ change: #endif } - /* recheck policy now with rq lock held */ + /* Re-check policy now with rq lock held: */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); @@ -4265,7 +4353,7 @@ change: * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + new_effective_prio = rt_effective_prio(p, newprio); if (new_effective_prio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } @@ -4294,15 +4382,15 @@ change: set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); - preempt_disable(); /* avoid rq from going away on us */ + + /* Avoid rq from going away on us: */ + preempt_disable(); task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); - /* - * Run balance callbacks after we've adjusted the PI chain. - */ + /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); preempt_enable(); @@ -4395,8 +4483,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) /* * Mimics kernel/events/core.c perf_copy_attr(). */ -static int sched_copy_attr(struct sched_attr __user *uattr, - struct sched_attr *attr) +static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) { u32 size; int ret; @@ -4404,19 +4491,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) return -EFAULT; - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ + /* Bail out on silly large: */ + if (size > PAGE_SIZE) goto err_size; - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = SCHED_ATTR_SIZE_VER0; if (size < SCHED_ATTR_SIZE_VER0) @@ -4451,7 +4538,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, return -EFAULT; /* - * XXX: do we want to be lenient like existing syscalls; or do we want + * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? */ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); @@ -4471,10 +4558,8 @@ err_size: * * Return: 0 on success. An error code otherwise. */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, - struct sched_param __user *, param) +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) { - /* negative values for policy are not valid */ if (policy < 0) return -EINVAL; @@ -4784,10 +4869,10 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, } /** - * sys_sched_setaffinity - set the cpu affinity of a process + * sys_sched_setaffinity - set the CPU affinity of a process * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask + * @user_mask_ptr: user-space pointer to the new CPU mask * * Return: 0 on success. An error code otherwise. */ @@ -4835,10 +4920,10 @@ out_unlock: } /** - * sys_sched_getaffinity - get the cpu affinity of a process + * sys_sched_getaffinity - get the CPU affinity of a process * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask + * @user_mask_ptr: user-space pointer to hold the current CPU mask * * Return: size of CPU mask copied to user_mask_ptr on success. An * error code otherwise. @@ -4881,7 +4966,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, */ SYSCALL_DEFINE0(sched_yield) { - struct rq *rq = this_rq_lock(); + struct rq_flags rf; + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, &rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -4890,9 +4980,8 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); + preempt_disable(); + rq_unlock(rq, &rf); sched_preempt_enable_no_resched(); schedule(); @@ -4966,7 +5055,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); * Typical broken usage is: * * while (!event) - * yield(); + * yield(); * * where one assumes that yield() will let 'the other' process run that will * make event true. If the current task is a SCHED_FIFO task that will never @@ -5057,31 +5146,48 @@ out_irq: } EXPORT_SYMBOL_GPL(yield_to); +int io_schedule_prepare(void) +{ + int old_iowait = current->in_iowait; + + current->in_iowait = 1; + blk_schedule_flush_plug(current); + + return old_iowait; +} + +void io_schedule_finish(int token) +{ + current->in_iowait = token; +} + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ long __sched io_schedule_timeout(long timeout) { - int old_iowait = current->in_iowait; - struct rq *rq; + int token; long ret; - current->in_iowait = 1; - blk_schedule_flush_plug(current); - - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); + token = io_schedule_prepare(); ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); + io_schedule_finish(token); return ret; } EXPORT_SYMBOL(io_schedule_timeout); +void io_schedule(void) +{ + int token; + + token = io_schedule_prepare(); + schedule(); + io_schedule_finish(token); +} +EXPORT_SYMBOL(io_schedule); + /** * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. @@ -5193,6 +5299,9 @@ void sched_show_task(struct task_struct *p) int ppid; unsigned long state = p->state; + /* Make sure the string lines up properly with the number of task states: */ + BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + if (!try_get_task_stack(p)) return; if (state) @@ -5264,7 +5373,7 @@ void init_idle_bootup_task(struct task_struct *idle) /** * init_idle - set up an idle thread for a given CPU * @idle: task in question - * @cpu: cpu the idle task belongs to + * @cpu: CPU the idle task belongs to * * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. @@ -5295,7 +5404,7 @@ void init_idle(struct task_struct *idle, int cpu) #endif /* * We're having a chicken and egg problem, even though we are - * holding rq->lock, the cpu isn't yet set to this cpu so the + * holding rq->lock, the CPU isn't yet set to this CPU so the * lockdep check in task_group() will fail. * * Similar case to sched_fork(). / Alternatively we could @@ -5360,7 +5469,7 @@ int task_can_attach(struct task_struct *p, /* * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu + * to a new cpuset; we don't want to change their CPU * affinity and isolating such threads by their set of * allowed nodes is unnecessary. Thus, cpusets are not * applicable for such threads. This prevents checking for @@ -5409,7 +5518,7 @@ out: #ifdef CONFIG_SMP -static bool sched_smp_initialized __read_mostly; +bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ @@ -5421,7 +5530,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (curr_cpu == target_cpu) return 0; - if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) return -EINVAL; /* TODO: This is not properly updating schedstats */ @@ -5452,7 +5561,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); @@ -5461,7 +5570,7 @@ void sched_setnuma(struct task_struct *p, int nid) #ifdef CONFIG_HOTPLUG_CPU /* - * Ensures that the idle task is using init_mm right before its cpu goes + * Ensure that the idle task is using init_mm right before its CPU goes * offline. */ void idle_task_exit(void) @@ -5517,11 +5626,11 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct pin_cookie cookie; + struct rq_flags orf = *rf; int dest_cpu; /* @@ -5545,16 +5654,15 @@ static void migrate_tasks(struct rq *dead_rq) for (;;) { /* * There's this thread running, bail when that's the only - * remaining thread. + * remaining thread: */ if (rq->nr_running == 1) break; /* - * pick_next_task assumes pinned rq->lock. + * pick_next_task() assumes pinned rq->lock: */ - cookie = lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task, cookie); + next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5567,10 +5675,9 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock, cookie); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); - raw_spin_lock(&rq->lock); + rq_relock(rq, rf); /* * Since we're inside stop-machine, _nothing_ should have @@ -5584,12 +5691,12 @@ static void migrate_tasks(struct rq *dead_rq) /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - - rq = __migrate_task(rq, next, dest_cpu); + rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = dead_rq; - raw_spin_lock(&rq->lock); + *rf = orf; + rq_relock(rq, rf); } raw_spin_unlock(&next->pi_lock); } @@ -5598,7 +5705,7 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -static void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq) { if (!rq->online) { const struct sched_class *class; @@ -5613,7 +5720,7 @@ static void set_rq_online(struct rq *rq) } } -static void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq) { if (rq->online) { const struct sched_class *class; @@ -5635,1647 +5742,10 @@ static void set_cpu_rq_start_time(unsigned int cpu) rq->age_stamp = sched_clock_cpu(cpu); } -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ - -#ifdef CONFIG_SCHED_DEBUG - -static __read_mostly int sched_debug_enabled; - -static int __init sched_debug_setup(char *str) -{ - sched_debug_enabled = 1; - - return 0; -} -early_param("sched_debug", sched_debug_setup); - -static inline bool sched_debug(void) -{ - return sched_debug_enabled; -} - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - struct sched_group *group = sd->groups; - - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %*pbl level %s\n", - cpumask_pr_args(sched_domain_span(sd)), sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); - } - - printk(KERN_DEBUG "%*s groups:", level + 1, ""); - do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } - - if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); - break; - } - - if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - break; - } - - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", - group->sgc->capacity); - } - - group = group->next; - } while (group != sd->groups); - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - int level = 0; - - if (!sched_debug_enabled) - return; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } -} -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_enabled 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { - if (sd->groups != sd->groups->next) - return 0; - } - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_AFFINE)) - return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct rcu_head *rcu) -{ - struct root_domain *rd = container_of(rcu, struct root_domain, rcu); - - cpupri_cleanup(&rd->cpupri); - cpudl_cleanup(&rd->cpudl); - free_cpumask_var(rd->dlo_mask); - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * If we dont want to free the old_rd yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); -} - -static int init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) - goto free_online; - if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_dlo_mask; - - init_dl_bw(&rd->dl_bw); - if (cpudl_init(&rd->cpudl) != 0) - goto free_dlo_mask; - - if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; - return 0; - -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_dlo_mask: - free_cpumask_var(rd->dlo_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -struct root_domain def_root_domain; - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -static void free_sched_groups(struct sched_group *sg, int free_sgc) -{ - struct sched_group *tmp, *first; - - if (!sg) - return; - - first = sg; - do { - tmp = sg->next; - - if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) - kfree(sg->sgc); - - kfree(sg); - sg = tmp; - } while (sg != first); -} - -static void destroy_sched_domain(struct sched_domain *sd) -{ - /* - * If its an overlapping domain it has private groups, iterate and - * nuke them all. - */ - if (sd->flags & SD_OVERLAP) { - free_sched_groups(sd->groups, 1); - } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgc); - kfree(sd->groups); - } - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); - kfree(sd); -} - -static void destroy_sched_domains_rcu(struct rcu_head *rcu) -{ - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - - while (sd) { - struct sched_domain *parent = sd->parent; - destroy_sched_domain(sd); - sd = parent; - } -} - -static void destroy_sched_domains(struct sched_domain *sd) -{ - if (sd) - call_rcu(&sd->rcu, destroy_sched_domains_rcu); -} - /* - * Keep a special pointer to the highest sched_domain that has - * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this - * allows us to avoid some pointer chasing select_idle_sibling(). - * - * Also keep a unique ID per domain (we use the first cpu number in - * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see cpus_share_cache(). + * used to mark begin/end of suspend/resume: */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); -DEFINE_PER_CPU(int, sd_llc_size); -DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym); - -static void update_top_cache_domain(int cpu) -{ - struct sched_domain_shared *sds = NULL; - struct sched_domain *sd; - int id = cpu; - int size = 1; - - sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) { - id = cpumask_first(sched_domain_span(sd)); - size = cpumask_weight(sched_domain_span(sd)); - sds = sd->shared; - } - - rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); - per_cpu(sd_llc_size, cpu) = size; - per_cpu(sd_llc_id, cpu) = id; - rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); - - sd = lowest_flag_domain(cpu, SD_NUMA); - rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); - - sd = highest_flag_domain(cpu, SD_ASYM_PACKING); - rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - /* - * Transfer SD_PREFER_SIBLING down in case of a - * degenerate parent; the spans match for this - * so the property transfers. - */ - if (parent->flags & SD_PREFER_SIBLING) - tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent); - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - tmp = sd; - sd = sd->parent; - destroy_sched_domain(tmp); - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp); - - update_top_cache_domain(cpu); -} - -/* Setup the mask of cpus configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); - return 0; - } - return 1; -} -__setup("isolcpus=", isolated_cpu_setup); - -struct s_data { - struct sched_domain ** __percpu sd; - struct root_domain *rd; -}; - -enum s_alloc { - sa_rootdomain, - sa_sd, - sa_sd_storage, - sa_none, -}; - -/* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. - * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. - * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * cpu they're built on, so check that. - * - */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) -{ - const struct cpumask *span = sched_domain_span(sd); - struct sd_data *sdd = sd->private; - struct sched_domain *sibling; - int i; - - for_each_cpu(i, span) { - sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) - continue; - - cpumask_set_cpu(i, sched_group_mask(sg)); - } -} - -/* - * Return the canonical balance cpu for this group, this is the first cpu - * of this group that's also in the iteration mask. - */ -int group_balance_cpu(struct sched_group *sg) -{ - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); -} - -static int -build_overlap_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered = sched_domains_tmpmask; - struct sd_data *sdd = sd->private; - struct sched_domain *sibling; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct cpumask *sg_span; - - if (cpumask_test_cpu(i, covered)) - continue; - - sibling = *per_cpu_ptr(sdd->sd, i); - - /* See the comment near build_group_mask(). */ - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) - continue; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - - if (!sg) - goto fail; - - sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - - cpumask_or(covered, covered, sg_span); - - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; - - /* - * Make sure the first group of this domain contains the - * canonical balance cpu. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - last->next = first; - } - sd->groups = groups; - - return 0; - -fail: - free_sched_groups(first, 0); - - return -ENOMEM; -} - -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) -{ - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - struct sched_domain *child = sd->child; - - if (child) - cpu = cpumask_first(sched_domain_span(child)); - - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); - atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ - } - - return cpu; -} - -/* - * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. - * - * Assumes the sched_domain tree is fully constructed - */ -static int -build_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL; - struct sd_data *sdd = sd->private; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered; - int i; - - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - - lockdep_assert_held(&sched_domains_mutex); - covered = sched_domains_tmpmask; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group, j; - - if (cpumask_test_cpu(i, covered)) - continue; - - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); - - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; - - return 0; -} - -/* - * Initialize sched groups cpu_capacity. - * - * cpu_capacity indicates the capacity of sched group, which is used while - * distributing the load between different sched groups in a sched domain. - * Typically cpu_capacity for all the groups in a sched domain will be same - * unless there are asymmetries in the topology. If there are asymmetries, - * group having more cpu_capacity will pickup more load compared to the - * group having less cpu_capacity. - */ -static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) -{ - struct sched_group *sg = sd->groups; - - WARN_ON(!sg); - - do { - int cpu, max_cpu = -1; - - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); - - if (!(sd->flags & SD_ASYM_PACKING)) - goto next; - - for_each_cpu(cpu, sched_group_cpus(sg)) { - if (max_cpu < 0) - max_cpu = cpu; - else if (sched_asym_prefer(cpu, max_cpu)) - max_cpu = cpu; - } - sg->asym_prefer_cpu = max_cpu; - -next: - sg = sg->next; - } while (sg != sd->groups); - - if (cpu != group_balance_cpu(sg)) - return; - - update_group_capacity(sd, cpu); -} - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -static int default_relax_domain_level = -1; -int sched_domain_level_max; - -static int __init setup_relax_domain_level(char *str) -{ - if (kstrtoint(str, 0, &default_relax_domain_level)) - pr_warn("Unable to set relax_domain_level\n"); - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* turn off idle balance on this domain */ - sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } -} - -static void __sdt_free(const struct cpumask *cpu_map); -static int __sdt_alloc(const struct cpumask *cpu_map); - -static void __free_domain_allocs(struct s_data *d, enum s_alloc what, - const struct cpumask *cpu_map) -{ - switch (what) { - case sa_rootdomain: - if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); /* fall through */ - case sa_sd: - free_percpu(d->sd); /* fall through */ - case sa_sd_storage: - __sdt_free(cpu_map); /* fall through */ - case sa_none: - break; - } -} - -static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, - const struct cpumask *cpu_map) -{ - memset(d, 0, sizeof(*d)); - - if (__sdt_alloc(cpu_map)) - return sa_sd_storage; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) - return sa_sd_storage; - d->rd = alloc_rootdomain(); - if (!d->rd) - return sa_sd; - return sa_rootdomain; -} - -/* - * NULL the sd_data elements we've used to build the sched_domain and - * sched_group structure so that the subsequent __free_domain_allocs() - * will not free the data we're using. - */ -static void claim_allocations(int cpu, struct sched_domain *sd) -{ - struct sd_data *sdd = sd->private; - - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) - *per_cpu_ptr(sdd->sg, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) - *per_cpu_ptr(sdd->sgc, cpu) = NULL; -} - -#ifdef CONFIG_NUMA -static int sched_domains_numa_levels; -enum numa_topology_type sched_numa_topology_type; -static int *sched_domains_numa_distance; -int sched_max_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; -#endif - -/* - * SD_flags allowed in topology descriptions. - * - * These flags are purely descriptive of the topology and do not prescribe - * behaviour. Behaviour is artificial and mapped in the below sd_init() - * function: - * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies - * - * Odd one out, which beside describing the topology has a quirk also - * prescribes the desired behaviour that goes along with it: - * - * SD_ASYM_PACKING - describes SMT quirks - */ -#define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ - SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ - SD_SHARE_POWERDOMAIN) - -static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) -{ - struct sd_data *sdd = &tl->data; - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - int sd_id, sd_weight, sd_flags = 0; - -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); - - if (tl->sd_flags) - sd_flags = (*tl->sd_flags)(); - if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - *sd = (struct sched_domain){ - .min_interval = sd_weight, - .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, - - .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, - - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE - | 1*SD_BALANCE_EXEC - | 1*SD_BALANCE_FORK - | 0*SD_BALANCE_WAKE - | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES - | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING - | 0*SD_NUMA - | sd_flags - , - - .last_balance = jiffies, - .balance_interval = sd_weight, - .smt_gain = 0, - .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, - .child = child, -#ifdef CONFIG_SCHED_DEBUG - .name = tl->name, -#endif - }; - - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd_id = cpumask_first(sched_domain_span(sd)); - - /* - * Convert topological properties into behaviour. - */ - - if (sd->flags & SD_ASYM_CPUCAPACITY) { - struct sched_domain *t = sd; - - for_each_lower_domain(t) - t->flags |= SD_BALANCE_WAKE; - } - - if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; - sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ - - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->imbalance_pct = 117; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - -#ifdef CONFIG_NUMA - } else if (sd->flags & SD_NUMA) { - sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; - - sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { - sd->flags &= ~(SD_BALANCE_EXEC | - SD_BALANCE_FORK | - SD_WAKE_AFFINE); - } - -#endif - } else { - sd->flags |= SD_PREFER_SIBLING; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; - } - - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. - */ - if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); - } - - sd->private = sdd; - - return sd; -} - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = - default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) - -void set_sched_topology(struct sched_domain_topology_level *tl) -{ - if (WARN_ON_ONCE(sched_smp_initialized)) - return; - - sched_domain_topology = tl; -} - -#ifdef CONFIG_NUMA - -static const struct cpumask *sd_numa_mask(int cpu) -{ - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; -} - -static void sched_numa_warn(const char *str) -{ - static int done = false; - int i,j; - - if (done) - return; - - done = true; - - printk(KERN_WARNING "ERROR: %s\n\n", str); - - for (i = 0; i < nr_node_ids; i++) { - printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); - printk(KERN_CONT "\n"); - } - printk(KERN_WARNING "\n"); -} - -bool find_numa_distance(int distance) -{ - int i; - - if (distance == node_distance(0, 0)) - return true; - - for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; - } - - return false; -} - -/* - * A system can have three types of NUMA topology: - * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system - * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes - * NUMA_BACKPLANE: nodes can reach other nodes through a backplane - * - * The difference between a glueless mesh topology and a backplane - * topology lies in whether communication between not directly - * connected nodes goes through intermediary nodes (where programs - * could run), or through backplane controllers. This affects - * placement of programs. - * - * The type of topology can be discerned with the following tests: - * - If the maximum distance between any nodes is 1 hop, the system - * is directly connected. - * - If for two nodes A and B, located N > 1 hops away from each other, - * there is an intermediary node C, which is < N hops away from both - * nodes A and B, the system is a glueless mesh. - */ -static void init_numa_topology_type(void) -{ - int a, b, c, n; - - n = sched_max_numa_distance; - - if (sched_domains_numa_levels <= 1) { - sched_numa_topology_type = NUMA_DIRECT; - return; - } - - for_each_online_node(a) { - for_each_online_node(b) { - /* Find two nodes furthest removed from each other. */ - if (node_distance(a, b) < n) - continue; - - /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { - if (node_distance(a, c) < n && - node_distance(b, c) < n) { - sched_numa_topology_type = - NUMA_GLUELESS_MESH; - return; - } - } - - sched_numa_topology_type = NUMA_BACKPLANE; - return; - } - } -} - -static void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the - * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. - */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); - } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; - } - - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; - } - - if (!level) - return; - - /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). - * - * The sched_domains_numa_distance[] array includes the actual distance - * numbers. - */ - - /* - * Here, we should temporarily reset sched_domains_numa_levels to 0. - * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be - * dangerous when we use it to iterate array sched_domains_numa_masks[][] - * in other functions. - * - * We reset it to 'level' at the end of this function. - */ - sched_domains_numa_levels = 0; - - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); - if (!sched_domains_numa_masks) - return; - - /* - * Now for each level, construct a mask per node which contains all - * cpus of nodes that are that many hops away from us. - */ - for (i = 0; i < level; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) - return; - - for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!mask) - return; - - sched_domains_numa_masks[i][j] = mask; - - for_each_node(k) { - if (node_distance(j, k) > sched_domains_numa_distance[i]) - continue; - - cpumask_or(mask, mask, cpumask_of_node(k)); - } - } - } - - /* Compute default topology size */ - for (i = 0; sched_domain_topology[i].mask; i++); - - tl = kzalloc((i + level + 1) * - sizeof(struct sched_domain_topology_level), GFP_KERNEL); - if (!tl) - return; - - /* - * Copy the default topology bits.. - */ - for (i = 0; sched_domain_topology[i].mask; i++) - tl[i] = sched_domain_topology[i]; - - /* - * .. and append 'j' levels of NUMA goodness. - */ - for (j = 0; j < level; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; - } - - sched_domain_topology = tl; - - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; - - init_numa_topology_type(); -} - -static void sched_domains_numa_masks_set(unsigned int cpu) -{ - int node = cpu_to_node(cpu); - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (node_distance(j, node) <= sched_domains_numa_distance[i]) - cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); - } - } -} - -static void sched_domains_numa_masks_clear(unsigned int cpu) -{ - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); - } -} - -#else -static inline void sched_init_numa(void) { } -static void sched_domains_numa_masks_set(unsigned int cpu) { } -static void sched_domains_numa_masks_clear(unsigned int cpu) { } -#endif /* CONFIG_NUMA */ - -static int __sdt_alloc(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - sdd->sd = alloc_percpu(struct sched_domain *); - if (!sdd->sd) - return -ENOMEM; - - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - - sdd->sg = alloc_percpu(struct sched_group *); - if (!sdd->sg) - return -ENOMEM; - - sdd->sgc = alloc_percpu(struct sched_group_capacity *); - if (!sdd->sgc) - return -ENOMEM; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - struct sched_domain_shared *sds; - struct sched_group *sg; - struct sched_group_capacity *sgc; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return -ENOMEM; - - *per_cpu_ptr(sdd->sd, j) = sd; - - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, cpu_to_node(j)); - if (!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sg) - return -ENOMEM; - - sg->next = sg; - - *per_cpu_ptr(sdd->sg, j) = sg; - - sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sgc) - return -ENOMEM; - - *per_cpu_ptr(sdd->sgc, j) = sgc; - } - } - - return 0; -} - -static void __sdt_free(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - if (sdd->sd) { - sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - } - - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); - if (sdd->sg) - kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgc) - kfree(*per_cpu_ptr(sdd->sgc, j)); - } - free_percpu(sdd->sd); - sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; - free_percpu(sdd->sg); - sdd->sg = NULL; - free_percpu(sdd->sgc); - sdd->sgc = NULL; - } -} - -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) -{ - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); - - if (child) { - sd->level = child->level + 1; - sched_domain_level_max = max(sched_domain_level_max, sd->level); - child->parent = sd; - - if (!cpumask_subset(sched_domain_span(child), - sched_domain_span(sd))) { - pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG - pr_err(" the %s domain not a subset of the %s domain\n", - child->name, sd->name); -#endif - /* Fixup, ensure @sd has at least @child cpus. */ - cpumask_or(sched_domain_span(sd), - sched_domain_span(sd), - sched_domain_span(child)); - } - - } - set_domain_attribute(sd, attr); - - return sd; -} - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) -{ - enum s_alloc alloc_state; - struct sched_domain *sd; - struct s_data d; - struct rq *rq = NULL; - int i, ret = -ENOMEM; - - alloc_state = __visit_domain_allocation_hell(&d, cpu_map); - if (alloc_state != sa_rootdomain) - goto error; - - /* Set up domains for cpus specified by the cpu_map. */ - for_each_cpu(i, cpu_map) { - struct sched_domain_topology_level *tl; - - sd = NULL; - for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); - if (tl == sched_domain_topology) - *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) - sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; - } - } - - /* Build the groups for the domains */ - for_each_cpu(i, cpu_map) { - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { - if (build_overlap_sched_groups(sd, i)) - goto error; - } else { - if (build_sched_groups(sd, i)) - goto error; - } - } - } - - /* Calculate CPU capacity for physical packages and nodes */ - for (i = nr_cpumask_bits-1; i >= 0; i--) { - if (!cpumask_test_cpu(i, cpu_map)) - continue; - - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); - init_sched_groups_capacity(i, sd); - } - } - - /* Attach the domains */ - rcu_read_lock(); - for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); - sd = *per_cpu_ptr(d.sd, i); - - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); - - cpu_attach_domain(sd, d.rd, i); - } - rcu_read_unlock(); - - if (rq && sched_debug_enabled) { - pr_info("span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } - - ret = 0; -error: - __free_domain_allocs(&d, alloc_state, cpu_map); - return ret; -} - -static cpumask_var_t *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom domains in 'doms_cur' */ - -/* - * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. - */ -static cpumask_var_t fallback_doms; - -/* - * arch_update_cpu_topology lets virtualized architectures update the - * cpu core maps. It is supposed to return 1 if the topology changed - * or 0 if it stayed the same. - */ -int __weak arch_update_cpu_topology(void) -{ - return 0; -} - -cpumask_var_t *alloc_sched_domains(unsigned int ndoms) -{ - int i; - cpumask_var_t *doms; - - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); - if (!doms) - return NULL; - for (i = 0; i < ndoms; i++) { - if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { - free_sched_domains(doms, i); - return NULL; - } - } - return doms; -} - -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) -{ - unsigned int i; - for (i = 0; i < ndoms; i++) - free_cpumask_var(doms[i]); - kfree(doms); -} - -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ -static int init_sched_domains(const struct cpumask *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = alloc_sched_domains(ndoms_cur); - if (!doms_cur) - doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); - - return err; -} - -/* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain - */ -static void detach_destroy_domains(const struct cpumask *cpu_map) -{ - int i; - - rcu_read_lock(); - for_each_cpu(i, cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - rcu_read_unlock(); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* fast path */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? (new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. - * - * The passed in 'doms_new' should be allocated using - * alloc_sched_domains. This routine takes ownership of it and will - * free_sched_domains it when done with it. If the caller failed the - * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_mask. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - int new_topology; - - mutex_lock(&sched_domains_mutex); - - /* always unregister in case we don't destroy any domains */ - unregister_sched_domain_sysctl(); - - /* Let architecture update cpu core mappings. */ - new_topology = arch_update_cpu_topology(); - - n = doms_new ? ndoms_new : 0; - - /* Destroy deleted domains */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur[i]); -match1: - ; - } - - n = ndoms_cur; - if (doms_new == NULL) { - n = 0; - doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); - } - - /* Build new domains */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* no match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) - free_sched_domains(doms_cur, ndoms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - -static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ +static int num_cpus_frozen; /* * Update cpusets according to cpu_active mask. If cpusets are @@ -7305,7 +5775,7 @@ static void cpuset_cpu_active(void) * cpuset configurations. */ } - cpuset_update_active_cpus(true); + cpuset_update_active_cpus(); } static int cpuset_cpu_inactive(unsigned int cpu) @@ -7328,7 +5798,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) if (overflow) return -EBUSY; - cpuset_update_active_cpus(false); + cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); @@ -7339,7 +5809,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; set_cpu_active(cpu, true); @@ -7352,17 +5822,17 @@ int sched_cpu_activate(unsigned int cpu) * Put the rq online, if not already. This happens: * * 1) In the early boot process, because we build the real domains - * after all cpus have been brought up. + * after all CPUs have been brought up. * * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); update_max_interval(); @@ -7420,18 +5890,20 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); - raw_spin_lock_irqsave(&rq->lock, flags); + + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, &rf); BUG_ON(rq->nr_running != 1); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(cpu); @@ -7467,7 +5939,7 @@ void __init sched_init_smp(void) /* * There's no userspace yet to cause hotplug operations; hence all the - * cpu masks are stable and all blatant races in the below code cannot + * CPU masks are stable and all blatant races in the below code cannot * happen. */ mutex_lock(&sched_domains_mutex); @@ -7487,6 +5959,7 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); + sched_clock_init_late(); sched_smp_initialized = true; } @@ -7502,6 +5975,7 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); + sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -7545,6 +6019,8 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; + sched_clock_init(); + for (i = 0; i < WAIT_TABLE_SIZE; i++) init_waitqueue_head(bit_wait_table + i); @@ -7583,10 +6059,8 @@ void __init sched_init(void) } #endif /* CONFIG_CPUMASK_OFFSTACK */ - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - init_dl_bandwidth(&def_dl_bandwidth, - global_rt_period(), global_rt_runtime()); + init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); @@ -7622,18 +6096,18 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* - * How much cpu bandwidth does root_task_group get? + * How much CPU bandwidth does root_task_group get? * * In case of task-groups formed thr' the cgroup filesystem, it - * gets 100% of the cpu resources in the system. This overall - * system cpu resource is divided among the tasks of + * gets 100% of the CPU resources in the system. This overall + * system CPU resource is divided among the tasks of * root_task_group and its child task-groups in a fair manner, * based on each entity's (task or task-group's) weight * (se->load.weight). * * In other words, if root_task_group has 10 tasks of weight * 1024) and two child groups A0 and A1 (of weight 1024 each), - * then A0's share of the cpu resource is: + * then A0's share of the CPU resource is: * * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * @@ -7686,7 +6160,7 @@ void __init sched_init(void) /* * The boot idle thread does lazy MMU switching as well: */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); enter_lazy_tlb(&init_mm, current); /* @@ -7742,10 +6216,14 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { - static unsigned long prev_jiffy; /* ratelimiting */ + /* Ratelimiting timestamp: */ + static unsigned long prev_jiffy; + unsigned long preempt_disable_ip; - rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ + /* WARN_ON_ONCE() by default, no rate limit required: */ + rcu_sleep_check(); + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) @@ -7754,7 +6232,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - /* Save this before calling printk(), since that will clobber it */ + /* Save this before calling printk(), since that will clobber it: */ preempt_disable_ip = get_preempt_disable_ip(current); printk(KERN_ERR @@ -7833,7 +6311,7 @@ void normalize_rt_tasks(void) */ /** - * curr_task - return the current task for a given cpu. + * curr_task - return the current task for a given CPU. * @cpu: the processor in question. * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! @@ -7849,13 +6327,13 @@ struct task_struct *curr_task(int cpu) #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given cpu. + * set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. * * Description: This function must only be used when non-maskable interrupts * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function + * notion of the current task on a CPU in a non-blocking manner. This function * must be called with all CPU's synchronized, and interrupts disabled, the * and caller must save the original value of the current task (see * curr_task() above) and restore that value before reenabling interrupts and @@ -7911,7 +6389,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); - WARN_ON(!parent); /* root should already exist */ + /* Root should already exist: */ + WARN_ON(!parent); tg->parent = parent; INIT_LIST_HEAD(&tg->children); @@ -7924,13 +6403,13 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) /* rcu callback to free various structures associated with a task group */ static void sched_free_group_rcu(struct rcu_head *rhp) { - /* now it should be safe to free those cfs_rqs */ + /* Now it should be safe to free those cfs_rqs: */ sched_free_group(container_of(rhp, struct task_group, rcu)); } void sched_destroy_group(struct task_group *tg) { - /* wait for possible concurrent references to cfs_rqs complete */ + /* Wait for possible concurrent references to cfs_rqs complete: */ call_rcu(&tg->rcu, sched_free_group_rcu); } @@ -7938,7 +6417,7 @@ void sched_offline_group(struct task_group *tg) { unsigned long flags; - /* end participation in shares distribution */ + /* End participation in shares distribution: */ unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); @@ -7978,25 +6457,27 @@ static void sched_change_group(struct task_struct *tsk, int type) */ void sched_move_task(struct task_struct *tsk) { - int queued, running; + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(tsk, &rf); + update_rq_clock(rq); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) + dequeue_task(rq, tsk, queue_flags); + if (running) put_prev_task(rq, tsk); sched_change_group(tsk, TASK_MOVE_GROUP); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - if (unlikely(running)) + enqueue_task(rq, tsk, queue_flags); + if (running) set_curr_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); @@ -8366,11 +6847,14 @@ int sched_rr_handler(struct ctl_table *table, int write, mutex_lock(&mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* make sure that internally we keep jiffies */ - /* also, writing zero resets timeslice to default */ + /* + * Make sure that internally we keep jiffies. + * Also, writing zero resets the timeslice to default: + */ if (!ret && write) { - sched_rr_timeslice = sched_rr_timeslice <= 0 ? - RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + sched_rr_timeslice = + sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : + msecs_to_jiffies(sysctl_sched_rr_timeslice); } mutex_unlock(&mutex); return ret; @@ -8398,11 +6882,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - sched_online_group(tg, parent); - return &tg->css; } +/* Expose task group only after completing cgroup initialization */ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -8431,6 +6924,7 @@ static void cpu_cgroup_fork(struct task_struct *task) rq = task_rq_lock(task, &rf); + update_rq_clock(rq); sched_change_group(task, TASK_SET_GROUP); task_rq_unlock(rq, task, &rf); @@ -8550,22 +7044,25 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_b->quota = quota; __refill_cfs_bandwidth_runtime(cfs_b); - /* restart the period timer (if active) to handle new period expiry */ + + /* Restart the period timer (if active) to handle new period expiry: */ if (runtime_enabled) start_cfs_bandwidth(cfs_b); + raw_spin_unlock_irq(&cfs_b->lock); for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; + struct rq_flags rf; - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); @@ -8690,8 +7187,8 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) parent_quota = parent_b->hierarchical_quota; /* - * ensure max(child_quota) <= parent_quota, inherit when no - * limit is set + * Ensure max(child_quota) <= parent_quota, inherit when no + * limit is set: */ if (quota == RUNTIME_INF) quota = parent_quota; @@ -8800,11 +7297,12 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif - { } /* terminate */ + { } /* Terminate */ }; struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork, diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9add206b5608..f95ab29a45d0 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - (long long)cputime64_to_clock_t(val[stat])); + (long long)nsec_to_clock_t(val[stat])); } return 0; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index e73119013c53..fba235c7d026 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -128,10 +128,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { best_cpu = cpudl_maximum(cp); if (later_mask) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index fd4659313640..76877a62b5fa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,7 @@ #include <linux/cpufreq.h> #include <linux/kthread.h> +#include <uapi/linux/sched/types.h> #include <linux/slab.h> #include <trace/events/power.h> @@ -35,6 +36,7 @@ struct sugov_policy { u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; + unsigned int cached_raw_freq; /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; @@ -51,7 +53,6 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; - unsigned int cached_raw_freq; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -60,6 +61,11 @@ struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. */ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -92,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, { struct cpufreq_policy *policy = sg_policy->policy; + if (sg_policy->next_freq == next_freq) + return; + + if (sg_policy->next_freq > next_freq) + next_freq = (sg_policy->next_freq + next_freq) >> 1; + + sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; if (policy->fast_switch_enabled) { - if (sg_policy->next_freq == next_freq) { - trace_cpu_frequency(policy->cur, smp_processor_id()); - return; - } - sg_policy->next_freq = next_freq; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (sg_policy->next_freq != next_freq) { - sg_policy->next_freq = next_freq; + } else { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -115,7 +122,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. * @@ -135,19 +142,18 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq (as calculated above) is returned, subject to policy min/max and * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, - unsigned long max) +static unsigned int get_next_freq(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) return sg_policy->next_freq; - sg_cpu->cached_raw_freq = freq; + sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -207,40 +227,37 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max); sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_cpu, util, max); + next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. + */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max, - unsigned int flags) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned long util = 0, max = 1; unsigned int j; - if (flags & SCHED_CPUFREQ_RT_DL) - return max_f; - - sugov_iowait_boost(sg_cpu, &util, &max); - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu; + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; s64 delta_ns; - if (j == smp_processor_id()) - continue; - - j_sg_cpu = &per_cpu(sugov_cpu, j); /* * If the CPU utilization was last updated before the previous * frequency update and the time elapsed between the last update @@ -254,7 +271,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) - return max_f; + return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; @@ -266,7 +283,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, sugov_iowait_boost(j_sg_cpu, &util, &max); } - return get_next_freq(sg_cpu, util, max); + return get_next_freq(sg_policy, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, @@ -289,7 +306,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + if (flags & SCHED_CPUFREQ_RT_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu); + sugov_update_commit(sg_policy, time, next_f); } @@ -473,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -512,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; + if (policy->transition_delay_us) { + tunables->rate_limit_us = policy->transition_delay_us; + } else { + unsigned int lat; + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + } policy->governor_data = sg_policy; sg_policy->tunables = tunables; @@ -579,25 +605,19 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->sg_policy = sg_policy; - if (policy_is_shared(policy)) { - sg_cpu->util = 0; - sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_RT; - sg_cpu->last_update = 0; - sg_cpu->cached_raw_freq = 0; - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_shared); - } else { - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_single); - } + sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + policy_is_shared(policy) ? + sugov_update_shared : + sugov_update_single); } return 0; } diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 11e9705bf937..981fcd7dc394 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7700a9cba335..aea3135c5d90 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,11 +4,8 @@ #include <linux/kernel_stat.h> #include <linux/static_key.h> #include <linux/context_tracking.h> +#include <linux/sched/cputime.h> #include "sched.h" -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif - #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -37,6 +34,18 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } +static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, + enum cpu_usage_stat idx) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + u64_stats_update_begin(&irqtime->sync); + cpustat[idx] += delta; + irqtime->total += delta; + irqtime->tick_delta += delta; + u64_stats_update_end(&irqtime->sync); +} + /* * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. @@ -54,7 +63,6 @@ void irqtime_account_irq(struct task_struct *curr) delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; - u64_stats_update_begin(&irqtime->sync); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@ -62,48 +70,28 @@ void irqtime_account_irq(struct task_struct *curr) * that do not consume any time, but still wants to run. */ if (hardirq_count()) - irqtime->hardirq_time += delta; + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - irqtime->softirq_time += delta; - - u64_stats_update_end(&irqtime->sync); + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) +static u64 irqtime_tick_accounted(u64 maxtime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t irq_cputime; - - irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; - irq_cputime = min(irq_cputime, maxtime); - cpustat[idx] += irq_cputime; - - return irq_cputime; -} + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + u64 delta; -static cputime_t irqtime_account_hi_update(cputime_t maxtime) -{ - return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), - CPUTIME_IRQ, maxtime); -} + delta = min(irqtime->tick_delta, maxtime); + irqtime->tick_delta -= delta; -static cputime_t irqtime_account_si_update(cputime_t maxtime) -{ - return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), - CPUTIME_SOFTIRQ, maxtime); + return delta; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) -static cputime_t irqtime_account_hi_update(cputime_t dummy) -{ - return 0; -} - -static cputime_t irqtime_account_si_update(cputime_t dummy) +static u64 irqtime_tick_accounted(u64 dummy) { return 0; } @@ -129,7 +117,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, u64 cputime) { int index; @@ -140,7 +128,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); + task_group_account_field(p, index, cputime); /* Account for user time used */ acct_account_cputime(p); @@ -151,7 +139,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +void account_guest_time(struct task_struct *p, u64 cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; @@ -162,11 +150,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; + cpustat[CPUTIME_USER] += cputime; + cpustat[CPUTIME_GUEST] += cputime; } } @@ -176,15 +164,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) * @cputime: the cpu time spent in kernel space since the last update * @index: pointer to cpustat field that has to be updated */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, int index) +void account_system_index_time(struct task_struct *p, + u64 cputime, enum cpu_usage_stat index) { /* Add system time to process. */ p->stime += cputime; account_group_system_time(p, cputime); /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); + task_group_account_field(p, index, cputime); /* Account for system time used */ acct_account_cputime(p); @@ -196,8 +184,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, int index) * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) +void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) { int index; @@ -213,33 +200,33 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, index); + account_system_index_time(p, cputime, index); } /* * Account for involuntary wait time. * @cputime: the cpu time spent in involuntary wait */ -void account_steal_time(cputime_t cputime) +void account_steal_time(u64 cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - cpustat[CPUTIME_STEAL] += (__force u64) cputime; + cpustat[CPUTIME_STEAL] += cputime; } /* * Account for idle time. * @cputime: the cpu time spent in idle wait */ -void account_idle_time(cputime_t cputime) +void account_idle_time(u64 cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + cpustat[CPUTIME_IOWAIT] += cputime; else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; + cpustat[CPUTIME_IDLE] += cputime; } /* @@ -247,21 +234,19 @@ void account_idle_time(cputime_t cputime) * ticks are not redelivered later. Due to that, this function may on * occasion account more time than the calling functions think elapsed. */ -static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) +static __always_inline u64 steal_account_process_time(u64 maxtime) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { - cputime_t steal_cputime; u64 steal; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; + steal = min(steal, maxtime); + account_steal_time(steal); + this_rq()->prev_steal_time += steal; - steal_cputime = min(nsecs_to_cputime(steal), maxtime); - account_steal_time(steal_cputime); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); - - return steal_cputime; + return steal; } #endif return 0; @@ -270,9 +255,9 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) /* * Account how much elapsed time was spent in steal, irq, or softirq time. */ -static inline cputime_t account_other_time(cputime_t max) +static inline u64 account_other_time(u64 max) { - cputime_t accounted; + u64 accounted; /* Shall be converted to a lockdep-enabled lightweight check */ WARN_ON_ONCE(!irqs_disabled()); @@ -280,10 +265,7 @@ static inline cputime_t account_other_time(cputime_t max) accounted = steal_account_process_time(max); if (accounted < max) - accounted += irqtime_account_hi_update(max - accounted); - - if (accounted < max) - accounted += irqtime_account_si_update(max - accounted); + accounted += irqtime_tick_accounted(max - accounted); return accounted; } @@ -315,7 +297,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; - cputime_t utime, stime; + u64 utime, stime; struct task_struct *t; unsigned int seq, nextseq; unsigned long flags; @@ -379,8 +361,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - u64 cputime = (__force u64) cputime_one_jiffy * ticks; - cputime_t other; + u64 other, cputime = TICK_NSEC * ticks; /* * When returning from idle, many ticks can get accounted at @@ -392,6 +373,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, other = account_other_time(ULONG_MAX); if (other >= cputime) return; + cputime -= other; if (this_cpu_ksoftirqd() == p) { @@ -400,7 +382,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, CPUTIME_SOFTIRQ); + account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); } else if (p == rq->idle) { @@ -408,7 +390,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, CPUTIME_SYSTEM); + account_system_index_time(p, cputime, CPUTIME_SYSTEM); } } @@ -437,9 +419,7 @@ void vtime_common_task_switch(struct task_struct *prev) else vtime_account_system(prev); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - vtime_account_user(prev); -#endif + vtime_flush(prev); arch_vtime_task_switch(prev); } #endif @@ -467,14 +447,14 @@ void vtime_account_irq_enter(struct task_struct *tsk) EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { *ut = p->utime; *st = p->stime; } EXPORT_SYMBOL_GPL(task_cputime_adjusted); -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { struct task_cputime cputime; @@ -491,7 +471,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t cputime, steal; + u64 cputime, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -502,7 +482,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - cputime = cputime_one_jiffy; + cputime = TICK_NSEC; steal = steal_account_process_time(ULONG_MAX); if (steal >= cputime) @@ -524,14 +504,14 @@ void account_process_tick(struct task_struct *p, int user_tick) */ void account_idle_ticks(unsigned long ticks) { - cputime_t cputime, steal; + u64 cputime, steal; if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - cputime = jiffies_to_cputime(ticks); + cputime = ticks * TICK_NSEC; steal = steal_account_process_time(ULONG_MAX); if (steal >= cputime) @@ -545,7 +525,7 @@ void account_idle_ticks(unsigned long ticks) * Perform (stime * rtime) / total, but avoid multiplication overflow by * loosing precision when the numbers are big. */ -static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +static u64 scale_stime(u64 stime, u64 rtime, u64 total) { u64 scaled; @@ -582,7 +562,7 @@ drop_precision: * followed by a 64/32->64 divide. */ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return (__force cputime_t) scaled; + return scaled; } /* @@ -607,14 +587,14 @@ drop_precision: */ static void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - cputime_t *ut, cputime_t *st) + u64 *ut, u64 *st) { - cputime_t rtime, stime, utime; + u64 rtime, stime, utime; unsigned long flags; /* Serialize concurrent callers such that we can honour our guarantees */ raw_spin_lock_irqsave(&prev->lock, flags); - rtime = nsecs_to_cputime(curr->sum_exec_runtime); + rtime = curr->sum_exec_runtime; /* * This is possible under two circumstances: @@ -645,8 +625,7 @@ static void cputime_adjust(struct task_cputime *curr, goto update; } - stime = scale_stime((__force u64)stime, (__force u64)rtime, - (__force u64)(stime + utime)); + stime = scale_stime(stime, rtime, stime + utime); update: /* @@ -679,7 +658,7 @@ out: raw_spin_unlock_irqrestore(&prev->lock, flags); } -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { struct task_cputime cputime = { .sum_exec_runtime = p->se.sum_exec_runtime, @@ -690,7 +669,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) } EXPORT_SYMBOL_GPL(task_cputime_adjusted); -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { struct task_cputime cputime; @@ -700,20 +679,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static cputime_t vtime_delta(struct task_struct *tsk) +static u64 vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return jiffies_to_cputime(now - tsk->vtime_snap); + return jiffies_to_nsecs(now - tsk->vtime_snap); } -static cputime_t get_vtime_delta(struct task_struct *tsk) +static u64 get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - cputime_t delta, other; + u64 delta, other; /* * Unlike tick based timing, vtime based timing never has lost @@ -722,7 +701,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_cputime(now - tsk->vtime_snap); + delta = jiffies_to_nsecs(now - tsk->vtime_snap); other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; @@ -732,9 +711,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) static void __vtime_account_system(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(tsk); - - account_system_time(tsk, irq_count(), delta_cpu); + account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); } void vtime_account_system(struct task_struct *tsk) @@ -749,14 +726,10 @@ void vtime_account_system(struct task_struct *tsk) void vtime_account_user(struct task_struct *tsk) { - cputime_t delta_cpu; - write_seqcount_begin(&tsk->vtime_seqcount); tsk->vtime_snap_whence = VTIME_SYS; - if (vtime_delta(tsk)) { - delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu); - } + if (vtime_delta(tsk)) + account_user_time(tsk, get_vtime_delta(tsk)); write_seqcount_end(&tsk->vtime_seqcount); } @@ -797,9 +770,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(tsk); - - account_idle_time(delta_cpu); + account_idle_time(get_vtime_delta(tsk)); } void arch_vtime_task_switch(struct task_struct *prev) @@ -826,10 +797,10 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_restore(flags); } -cputime_t task_gtime(struct task_struct *t) +u64 task_gtime(struct task_struct *t) { unsigned int seq; - cputime_t gtime; + u64 gtime; if (!vtime_accounting_enabled()) return t->gtime; @@ -851,9 +822,9 @@ cputime_t task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { - cputime_t delta; + u64 delta; unsigned int seq; if (!vtime_accounting_enabled()) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 70ef2b1901e4..a2ce59015642 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -252,7 +252,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online cpu. */ - cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); if (cpu >= nr_cpu_ids) { /* * Fail to find any suitable cpu. @@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * * This function returns true if: * - * runtime / (deadline - t) > dl_runtime / dl_period , + * runtime / (deadline - t) > dl_runtime / dl_deadline , * * IOW we can't recycle current parameters. * - * Notice that the bandwidth check is done against the period. For + * Notice that the bandwidth check is done against the deadline. For * task with deadline equal to period this is the same of using - * dl_deadline instead of dl_period in the equation above. + * dl_period instead of dl_deadline in the equation above. */ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se, u64 t) @@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * (pi_se->dl_runtime >> DL_SCALE); @@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, } } +static inline u64 dl_next_period(struct sched_dl_entity *dl_se) +{ + return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period; +} + /* * If the entity depleted all its runtime, and if we want it to sleep * while waiting for some new execution time to become available, we - * set the bandwidth enforcement timer to the replenishment instant + * set the bandwidth replenishment timer to the replenishment instant * and try to activate it. * * Notice that it is important for the caller to know if the timer @@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p) * that it is actually coming from rq->clock and not from * hrtimer's time base reading. */ - act = ns_to_ktime(dl_se->deadline); + act = ns_to_ktime(dl_next_period(dl_se)); now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); rf.cookie = lockdep_pin_lock(&rq->lock); + update_rq_clock(rq); /* * Now that the task has been migrated to the new RQ and we @@ -663,9 +669,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); push_dl_task(rq); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif @@ -689,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) timer->function = dl_task_timer; } +/* + * During the activation, CBS checks if it can reuse the current task's + * runtime and period. If the deadline of the task is in the past, CBS + * cannot use the runtime, and so it replenishes the task. This rule + * works fine for implicit deadline tasks (deadline == period), and the + * CBS was designed for implicit deadline tasks. However, a task with + * constrained deadline (deadine < period) might be awakened after the + * deadline, but before the next period. In this case, replenishing the + * task would allow it to run for runtime / deadline. As in this case + * deadline < period, CBS enables a task to run for more than the + * runtime / period. In a very loaded system, this can cause a domino + * effect, making other tasks miss their deadlines. + * + * To avoid this problem, in the activation of a constrained deadline + * task after the deadline but before the next period, throttle the + * task and set the replenishing timer to the begin of the next period, + * unless it is boosted. + */ +static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) && + dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { + if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + return; + dl_se->dl_throttled = 1; + } +} + static int dl_runtime_exceeded(struct sched_dl_entity *dl_se) { @@ -922,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } +static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline < dl_se->dl_period; +} + static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -948,6 +990,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) } /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + dl_check_constrained_dl(&p->dl); + + /* * If p is throttled, we do nothing. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on * its rq, the bandwidth timer callback (which clearly has not @@ -958,7 +1009,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1032,9 +1083,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. */ if (unlikely(dl_task(curr)) && - (tsk_nr_cpus_allowed(curr) < 2 || + (curr->nr_cpus_allowed < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (tsk_nr_cpus_allowed(p) > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_later_rq(p); if (target != -1 && @@ -1055,7 +1106,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (tsk_nr_cpus_allowed(rq->curr) == 1 || + if (rq->curr->nr_cpus_allowed == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1063,7 +1114,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (tsk_nr_cpus_allowed(p) != 1 && + if (p->nr_cpus_allowed != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; @@ -1118,7 +1169,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, } struct task_struct * -pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1133,9 +1184,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_dl_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1178,7 +1229,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1235,7 +1286,7 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + cpumask_test_cpu(cpu, &p->cpus_allowed)) return 1; return 0; } @@ -1279,7 +1330,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (tsk_nr_cpus_allowed(task) == 1) + if (task->nr_cpus_allowed == 1) return -1; /* @@ -1384,8 +1435,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, - tsk_cpus_allowed(task)) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { @@ -1425,7 +1475,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(tsk_nr_cpus_allowed(p) <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1464,7 +1514,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - tsk_nr_cpus_allowed(rq->curr) > 1) { + rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; } @@ -1611,9 +1661,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - tsk_nr_cpus_allowed(p) > 1 && + p->nr_cpus_allowed > 1 && dl_task(rq->curr) && - (tsk_nr_cpus_allowed(rq->curr) < 2 || + (rq->curr->nr_cpus_allowed < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1727,14 +1777,13 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (rq->curr != p) { #ifdef CONFIG_SMP - if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) queue_push_tasks(rq); -#else +#endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); -#endif } } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fa178b62ea79..38f019324f1a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -11,7 +11,8 @@ */ #include <linux/proc_fs.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/utsname.h> @@ -953,6 +954,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); + if (p->policy == SCHED_DEADLINE) { + P(dl.runtime); + P(dl.deadline); + } #undef PN_SCHEDSTAT #undef PN #undef __PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..d71109321841 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,7 +20,9 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/topology.h> + #include <linux/latencytop.h> #include <linux/cpumask.h> #include <linux/cpuidle.h> @@ -715,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP + +#include "sched-pelt.h" + static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are - * dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ - /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { @@ -1551,7 +1547,7 @@ static void task_numa_compare(struct task_numa_env *env, */ if (cur) { /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) + if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) goto unlock; /* @@ -1661,7 +1657,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) + if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) continue; env->dst_cpu = cpu; @@ -2657,6 +2653,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) if (tg_weight) shares /= tg_weight; + /* + * MIN_SHARES has to be unscaled here to support per-CPU partitioning + * of a group with small tg->shares value. It is a floor value which is + * assigned as a minimum load.weight to the sched_entity representing + * the group on a CPU. + * + * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 + * on an 8-core system with 8 tasks each runnable on one CPU shares has + * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In + * case no task is runnable on a CPU MIN_SHARES=2 should be returned + * instead of 0. + */ if (shares < MIN_SHARES) shares = MIN_SHARES; if (shares > tg->shares) @@ -2689,16 +2697,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2707,54 +2719,23 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ -static const u32 runnable_avg_yN_inv[] = { - 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, - 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, - 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, - 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, - 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, - 0x85aac367, 0x82cd8698, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent - * over-estimates when re-combining. - */ -static const u32 runnable_avg_yN_sum[] = { - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to - * lower integers. See Documentation/scheduler/sched-avg.txt how these - * were generated: - */ -static const u32 __accumulated_sum_N32[] = { - 0, 23371, 35056, 40899, 43820, 45281, - 46011, 46376, 46559, 46650, 46696, 46719, -}; - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; - if (!n) - return val; - else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + if (unlikely(n > LOAD_AVG_PERIOD * 63)) return 0; /* after bounds checking we can collapse to 32-bit */ @@ -2776,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n) return val; } +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 */ -static u32 __compute_runnable_contrib(u64 n) +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u32 contrib = 0; + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; - if (likely(n <= LOAD_AVG_PERIOD)) - return runnable_avg_yN_sum[n]; - else if (unlikely(n >= LOAD_AVG_MAX_N)) - return LOAD_AVG_MAX; + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; -} + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} /* * We can represent the historical contribution to runnable average as the @@ -2830,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -__update_load_avg(u64 now, int cpu, struct sched_avg *sa, +___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2855,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta >>= 10; if (!delta) return 0; - sa->last_update_time = now; - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + sa->last_update_time += delta << 10; - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; - - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; + return 1; +} - sa->period_contrib += delta; +static int +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); +} - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - } +static int +__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); +} - return decayed; +static int +__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + return ___update_load_avg(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq); } /* @@ -2995,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { + u64 p_last_update_time; + u64 n_last_update_time; + if (!sched_feat(ATTACH_AGE_LOAD)) return; @@ -3005,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se, * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad. */ - if (se->avg.last_update_time && prev) { - u64 p_last_update_time; - u64 n_last_update_time; + if (!(se->avg.last_update_time && prev)) + return; #ifndef CONFIG_64BIT + { u64 p_last_update_time_copy; u64 n_last_update_time_copy; @@ -3024,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se, } while (p_last_update_time != p_last_update_time_copy || n_last_update_time != n_last_update_time_copy); + } #else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), - &se->avg, 0, 0, NULL); - se->avg.last_update_time = n_last_update_time; - } + __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + se->avg.last_update_time = n_last_update_time; } /* Take into account change of utilization of a child task group */ @@ -3154,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -3246,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) set_tg_cfs_propagate(cfs_rq); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3279,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); @@ -3388,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); } /* @@ -3424,7 +3466,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq); +static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ @@ -3453,7 +3495,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq) +static inline int idle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -3582,10 +3624,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); @@ -3657,6 +3707,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Substract its load from the cfs_rq->runnable_avg. + * - Substract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3681,7 +3740,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3864,7 +3923,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4235,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4253,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); if (!remaining) break; @@ -4761,7 +4821,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4820,7 +4880,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -5061,15 +5121,16 @@ void cpu_load_update_nohz_stop(void) unsigned long curr_jiffies = READ_ONCE(jiffies); struct rq *this_rq = this_rq(); unsigned long load; + struct rq_flags rf; if (curr_jiffies == this_rq->last_load_update_tick) return; load = weighted_cpuload(cpu_of(this_rq)); - raw_spin_lock(&this_rq->lock); + rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); - raw_spin_unlock(&this_rq->lock); + rq_unlock(this_rq, &rf); } #else /* !CONFIG_NO_HZ_COMMON */ static inline void cpu_load_update_nohz(struct rq *this_rq, @@ -5424,7 +5485,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_cpus(group), - tsk_cpus_allowed(p))) + &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, @@ -5544,7 +5605,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return cpumask_first(sched_group_cpus(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5683,7 +5744,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int if (!test_idle_cores(target, false)) return -1; - cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); for_each_cpu_wrap(core, cpus, target, wrap) { bool idle = true; @@ -5717,7 +5778,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) return cpu; @@ -5763,13 +5824,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. */ - if ((avg_idle / 512) < avg_cost) + if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) return -1; time = local_clock(); for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) break; @@ -5924,7 +5985,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + && cpumask_test_cpu(cpu, &p->cpus_allowed); } rcu_read_lock(); @@ -6213,7 +6274,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -6320,15 +6381,8 @@ simple: return p; idle: - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. - */ - lockdep_unpin_lock(&rq->lock, cookie); - new_tasks = idle_balance(rq); - lockdep_repin_lock(&rq->lock, cookie); + new_tasks = idle_balance(rq, rf); + /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -6671,7 +6725,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -6691,7 +6745,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -6740,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - deactivate_task(env->src_rq, p, 0); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -6873,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6884,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) */ static void attach_one_task(struct rq *rq, struct task_struct *p) { - raw_spin_lock(&rq->lock); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); attach_task(rq, p); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -6897,8 +6954,10 @@ static void attach_tasks(struct lb_env *env) { struct list_head *tasks = &env->tasks; struct task_struct *p; + struct rq_flags rf; - raw_spin_lock(&env->dst_rq->lock); + rq_lock(env->dst_rq, &rf); + update_rq_clock(env->dst_rq); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); @@ -6907,7 +6966,7 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - raw_spin_unlock(&env->dst_rq->lock); + rq_unlock(env->dst_rq, &rf); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6915,9 +6974,9 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); /* @@ -6925,6 +6984,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -6932,11 +6993,12 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } /* @@ -6990,12 +7052,12 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static unsigned long task_h_load(struct task_struct *p) @@ -7225,7 +7287,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) /* * Group imbalance indicates (and tries to solve) the problem where balancing - * groups is inadequate due to tsk_cpus_allowed() constraints. + * groups is inadequate due to ->cpus_allowed constraints. * * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a * cpumask covering 1 cpu of the first group and 3 cpus of the second group. @@ -7496,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false; @@ -7512,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; - sgs = &sds->local_stat; + sgs = local; if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) @@ -7536,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * the tasks on the system). */ if (prefer_sibling && sds->local && - group_has_capacity(env, &sds->local_stat) && - (sgs->sum_nr_running > 1)) { + group_has_capacity(env, local) && + (sgs->sum_nr_running > local->sum_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -7568,7 +7631,7 @@ next_group: /** * check_asym_packing - Check to see if the group is packed into the - * sched doman. + * sched domain. * * This is primarily intended to used at the sibling level. Some * cores like POWER7 prefer to use lower numbered SMT threads. In the @@ -8013,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; - unsigned long flags; + struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { @@ -8076,7 +8139,8 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - raw_spin_lock_irqsave(&busiest->lock, flags); + rq_lock_irqsave(busiest, &rf); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -8092,14 +8156,14 @@ more_balance: * See task_rq_lock() family for the details. */ - raw_spin_unlock(&busiest->lock); + rq_unlock(busiest, &rf); if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } - local_irq_restore(flags); + local_irq_restore(rf.flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -8177,14 +8241,15 @@ more_balance: sd->nr_balance_failed++; if (need_active_balance(&env)) { + unsigned long flags; + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, * if the curr task on busiest cpu can't be * moved to this_cpu */ - if (!cpumask_test_cpu(this_cpu, - tsk_cpus_allowed(busiest->curr))) { + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; @@ -8297,7 +8362,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -8311,6 +8376,14 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { rcu_read_lock(); @@ -8388,6 +8461,8 @@ out: if (pulled_task) this_rq->idle_stamp = 0; + rq_repin_lock(this_rq, rf); + return pulled_task; } @@ -8405,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; + struct rq_flags rf; - raw_spin_lock_irq(&busiest_rq->lock); + rq_lock_irq(busiest_rq, &rf); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -8443,6 +8519,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { @@ -8456,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock(&busiest_rq->lock); + rq_unlock(busiest_rq, &rf); if (p) attach_one_task(target_rq, p); @@ -8754,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { - raw_spin_lock_irq(&rq->lock); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); + rebalance_domains(rq, CPU_IDLE); } @@ -8948,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; struct rq *rq = this_rq(); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); @@ -8970,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -9264,6 +9345,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); @@ -9331,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -9348,17 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se; + struct sched_entity *se = tg->se[i]; + struct rq_flags rf; - se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - - /* Possible calls to update_curr() need rq clock */ + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } + rq_unlock_irqrestore(rq, &rf); } done: diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..11192e0cb122 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -51,6 +51,18 @@ SCHED_FEAT(NONTASK_CAPACITY, true) */ SCHED_FEAT(TTWU_QUEUE, true) +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ +SCHED_FEAT(SIS_AVG_CPU, false) + +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + #ifdef HAVE_RT_PUSH_IPI /* * In order to avoid a thundering herd attack of CPUs that are diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6a4bae0a649d..2a25a9ec2c6e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -2,6 +2,7 @@ * Generic entry point for the idle threads */ #include <linux/sched.h> +#include <linux/sched/idle.h> #include <linux/cpu.h> #include <linux/cpuidle.h> #include <linux/cpuhotplug.h> @@ -9,6 +10,7 @@ #include <linux/mm.h> #include <linux/stackprotector.h> #include <linux/suspend.h> +#include <linux/livepatch.h> #include <asm/tlb.h> @@ -264,6 +266,9 @@ static void do_idle(void) sched_ttwu_pending(); schedule_preempt_disabled(); + + if (unlikely(klp_patch_pending(current))) + klp_update_patch_state(current); } bool cpu_in_idle(unsigned long pc) diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 5405d3feb112..0c00172db63e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { put_prev_task(rq, prev); update_idle_core(rq); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a2d6eb71f06b..f15fb2bdbc0d 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -7,6 +7,7 @@ */ #include <linux/export.h> +#include <linux/sched/loadavg.h> #include "sched.h" @@ -168,7 +169,7 @@ static inline int calc_load_write_idx(void) * If the folding window started, make sure we start writing in the * next idle-delta. */ - if (!time_before(jiffies, calc_load_update)) + if (!time_before(jiffies, READ_ONCE(calc_load_update))) idx++; return idx & 1; @@ -201,8 +202,9 @@ void calc_load_exit_idle(void) struct rq *this_rq = this_rq(); /* - * If we're still before the sample window, we're done. + * If we're still before the pending sample window, we're done. */ + this_rq->calc_load_update = READ_ONCE(calc_load_update); if (time_before(jiffies, this_rq->calc_load_update)) return; @@ -211,7 +213,6 @@ void calc_load_exit_idle(void) * accounted through the nohz accounting, so skip the entire deal and * sync up for the next window. */ - this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update + 10)) this_rq->calc_load_update += LOAD_FREQ; } @@ -307,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp, */ static void calc_global_nohz(void) { + unsigned long sample_window; long delta, active, n; - if (!time_before(jiffies, calc_load_update + 10)) { + sample_window = READ_ONCE(calc_load_update); + if (!time_before(jiffies, sample_window + 10)) { /* * Catch-up, fold however many we are behind still */ - delta = jiffies - calc_load_update - 10; + delta = jiffies - sample_window - 10; n = 1 + (delta / LOAD_FREQ); active = atomic_long_read(&calc_load_tasks); @@ -323,7 +326,7 @@ static void calc_global_nohz(void) avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; + WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ); } /* @@ -351,9 +354,11 @@ static inline void calc_global_nohz(void) { } */ void calc_global_load(unsigned long ticks) { + unsigned long sample_window; long active, delta; - if (time_before(jiffies, calc_load_update + 10)) + sample_window = READ_ONCE(calc_load_update); + if (time_before(jiffies, sample_window + 10)) return; /* @@ -370,7 +375,7 @@ void calc_global_load(unsigned long ticks) avenrun[1] = calc_load(avenrun[1], EXP_5, active); avenrun[2] = calc_load(avenrun[2], EXP_15, active); - calc_load_update += LOAD_FREQ; + WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); /* * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2516b8df6dbb..979b7341008a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,7 @@ #include <linux/irq_work.h> int sched_rr_timeslice = RR_TIMESLICE; +int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -334,7 +335,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -351,7 +352,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1323,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1412,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (tsk_nr_cpus_allowed(curr) < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1436,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (tsk_nr_cpus_allowed(rq->curr) == 1 || + if (rq->curr->nr_cpus_allowed == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1444,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (tsk_nr_cpus_allowed(p) != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1523,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1535,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_rt_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1578,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1590,7 +1591,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + cpumask_test_cpu(cpu, &p->cpus_allowed)) return 1; return 0; } @@ -1628,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (tsk_nr_cpus_allowed(task) == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1725,8 +1726,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, - tsk_cpus_allowed(task)) || + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { @@ -1761,7 +1761,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(tsk_nr_cpus_allowed(p) <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq) #define RT_PUSH_IPI_EXECUTING 1 #define RT_PUSH_IPI_RESTART 2 +/* + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * On large CPU boxes, there's the case that several CPUs could schedule + * a lower priority task at the same time, in which case it will look for + * any overloaded CPUs that it could pull a task from. To do this, the runqueue + * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting + * for a single overloaded CPU's runqueue lock can produce a large latency. + * (This has actually been observed on large boxes running cyclictest). + * Instead of taking the runqueue lock of the overloaded CPU, each of the + * CPUs that scheduled a lower priority task simply sends an IPI to the + * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with + * lots of contention. The overloaded CPU will look to push its non-running + * RT task off, and if it does, it can then ignore the other IPIs coming + * in, and just pass those IPIs off to any other overloaded CPU. + * + * When a CPU schedules a lower priority task, it only sends an IPI to + * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, + * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with + * RT overloaded tasks, would cause 100 IPIs to go out at once. + * + * The overloaded RT CPU, when receiving an IPI, will try to push off its + * overloaded RT tasks and then send an IPI to the next CPU that has + * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks + * have completed. Just because a CPU may have pushed off its own overloaded + * RT task does not mean it should stop sending the IPI around to other + * overloaded CPUs. There may be another RT task waiting to run on one of + * those CPUs that are of higher priority than the one that was just + * pushed. + * + * An optimization that could possibly be made is to make a CPU array similar + * to the cpupri array mask of all running RT tasks, but for the overloaded + * case, then the IPI could be sent to only the CPU with the highest priority + * RT task waiting, and that CPU could send off further IPIs to the CPU with + * the next highest waiting task. Since the overloaded case is much less likely + * to happen, the complexity of this implementation may not be worth it. + * Instead, just send an IPI around to all overloaded CPUs. + * + * The rq->rt.push_flags holds the status of the IPI that is going around. + * A run queue can only send out a single IPI at a time. The possible flags + * for rq->rt.push_flags are: + * + * (None or zero): No IPI is going around for the current rq + * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around + * RT_PUSH_IPI_RESTART: The priority of the running task for the rq + * has changed, and the IPI should restart + * circulating the overloaded CPUs again. + * + * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated + * before sending to the next CPU. + * + * Instead of having all CPUs that schedule a lower priority task send + * an IPI to the same "first" CPU in the RT overload mask, they send it + * to the next overloaded CPU after their own CPU. This helps distribute + * the work when there's more than one overloaded CPU and multiple CPUs + * scheduling in lower priority tasks. + * + * When a rq schedules a lower priority task than what was currently + * running, the next CPU with overloaded RT tasks is examined first. + * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower + * priority task, it will send an IPI first to CPU 5, then CPU 5 will + * send to CPU 1 if it is still overloaded. CPU 1 will clear the + * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. + * + * The first CPU to notice IPI_RESTART is set, will clear that flag and then + * send an IPI to the next overloaded CPU after the rq->cpu and not the next + * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 + * schedules a lower priority task, and the IPI_RESTART gets set while the + * handling is being done on CPU 5, it will clear the flag and send it back to + * CPU 4 instead of CPU 1. + * + * Note, the above logic can be disabled by turning off the sched_feature + * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be + * taken by the CPU requesting a pull and the waiting RT task will be pulled + * by that CPU. This may be fine for machines with few CPUs. + */ static void tell_cpu_to_push(struct rq *rq) { int cpu; @@ -2121,9 +2202,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - tsk_nr_cpus_allowed(p) > 1 && + p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (tsk_nr_cpus_allowed(rq->curr) < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -2196,12 +2277,11 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); -#else +#endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ } } @@ -2246,6 +2326,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) } } +#ifdef CONFIG_POSIX_TIMERS static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; @@ -2267,6 +2348,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } +#else +static inline void watchdog(struct rq *rq, struct task_struct *p) { } +#endif static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h new file mode 100644 index 000000000000..cd200d16529e --- /dev/null +++ b/kernel/sched/sched-pelt.h @@ -0,0 +1,13 @@ +/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ + +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b34c7826ca5..7808ab050599 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,9 +1,27 @@ #include <linux/sched.h> +#include <linux/sched/autogroup.h> #include <linux/sched/sysctl.h> +#include <linux/sched/topology.h> #include <linux/sched/rt.h> -#include <linux/u64_stats_sync.h> #include <linux/sched/deadline.h> +#include <linux/sched/clock.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/signal.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/mm.h> +#include <linux/sched/cpufreq.h> +#include <linux/sched/stat.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> +#include <linux/sched/init.h> + +#include <linux/u64_stats_sync.h> +#include <linux/kernel_stat.h> #include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> @@ -12,6 +30,10 @@ #include <linux/tick.h> #include <linux/slab.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif + #include "cpupri.h" #include "cpudeadline.h" #include "cpuacct.h" @@ -222,7 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -extern struct mutex sched_domains_mutex; +extern void init_dl_bw(struct dl_bw *dl_b); #ifdef CONFIG_CGROUP_SCHED @@ -583,6 +605,13 @@ struct root_domain { }; extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; +extern cpumask_var_t fallback_doms; +extern cpumask_var_t sched_domains_tmpmask; + +extern void init_defrootdomain(void); +extern int init_sched_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ @@ -644,7 +673,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_skip_update; + unsigned int clock_update_flags; u64 clock; u64 clock_task; @@ -768,28 +797,110 @@ static inline u64 __rq_clock_broken(struct rq *rq) return READ_ONCE(rq->clock); } +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq-clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + static inline u64 rq_clock(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock_task; } -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 - static inline void rq_clock_skip_update(struct rq *rq, bool skip) { lockdep_assert_held(&rq->lock); if (skip) - rq->clock_skip_update |= RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; else - rq->clock_skip_update &= ~RQCF_REQ_SKIP; + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(&rq->lock); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(&rq->lock, rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(&rq->lock, rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif } #ifdef CONFIG_NUMA @@ -803,6 +914,16 @@ extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); #endif +#ifdef CONFIG_NUMA +extern void sched_init_numa(void); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void sched_domains_numa_masks_clear(unsigned int cpu); +#else +static inline void sched_init_numa(void) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +#endif + #ifdef CONFIG_NUMA_BALANCING /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { @@ -969,7 +1090,7 @@ static inline void sched_ttwu_pending(void) { } #endif /* CONFIG_SMP */ #include "stats.h" -#include "auto_group.h" +#include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -1210,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 #define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 -#define ENQUEUE_HEAD 0x08 -#define ENQUEUE_REPLENISH 0x10 +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 #ifdef CONFIG_SMP -#define ENQUEUE_MIGRATED 0x20 +#define ENQUEUE_MIGRATED 0x40 #else #define ENQUEUE_MIGRATED 0x00 #endif @@ -1245,7 +1368,7 @@ struct sched_class { */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev, - struct pin_cookie cookie); + struct rq_flags *rf); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1501,13 +1624,9 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -struct rq_flags { - unsigned long flags; - struct pin_cookie cookie; -}; - struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); + struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1515,7 +1634,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); } @@ -1524,11 +1643,67 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -1674,6 +1849,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +extern void set_rq_online (struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern bool sched_smp_initialized; + #else /* CONFIG_SMP */ /* @@ -1718,7 +1897,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); - #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); @@ -1750,14 +1928,19 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { - u64 hardirq_time; - u64 softirq_time; + u64 total; + u64 tick_delta; u64 irq_start_time; struct u64_stats_sync sync; }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +/* + * Returns the irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * and never move forward. + */ static inline u64 irq_time_read(int cpu) { struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); @@ -1766,7 +1949,7 @@ static inline u64 irq_time_read(int cpu) do { seq = __u64_stats_fetch_begin(&irqtime->sync); - total = irqtime->softirq_time + irqtime->hardirq_time; + total = irqtime->total; } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); return total; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 34659a853505..d5710651043b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -164,106 +164,3 @@ sched_info_switch(struct rq *rq, #define sched_info_arrive(rq, next) do { } while (0) #define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ - -/* - * The following are functions that support scheduler-internal time accounting. - * These functions are generally called at the timer tick. None of this depends - * on CONFIG_SCHEDSTATS. - */ - -/** - * cputimer_running - return true if cputimer is running - * - * @tsk: Pointer to target task. - */ -static inline bool cputimer_running(struct task_struct *tsk) - -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - - /* Check if cputimer isn't running. This is accessed without locking. */ - if (!READ_ONCE(cputimer->running)) - return false; - - /* - * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime - * in __exit_signal(), we won't account to the signal struct further - * cputime consumed by that task, even though the task can still be - * ticking after __exit_signal(). - * - * In order to keep a consistent behaviour between thread group cputime - * and thread group cputimer accounting, lets also ignore the cputime - * elapsing after __exit_signal() in any thread group timer running. - * - * This makes sure that POSIX CPU clocks and timers are synchronized, so - * that a POSIX CPU timer won't expire while the corresponding POSIX CPU - * clock delta is behind the expiring timer value. - */ - if (unlikely(!tsk->sighand)) - return false; - - return true; -} - -/** - * account_group_user_time - Maintain utime for a thread group. - * - * @tsk: Pointer to task structure. - * @cputime: Time value by which to increment the utime field of the - * thread_group_cputime structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the utime field there. - */ -static inline void account_group_user_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - - if (!cputimer_running(tsk)) - return; - - atomic64_add(cputime, &cputimer->cputime_atomic.utime); -} - -/** - * account_group_system_time - Maintain stime for a thread group. - * - * @tsk: Pointer to task structure. - * @cputime: Time value by which to increment the stime field of the - * thread_group_cputime structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the stime field there. - */ -static inline void account_group_system_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - - if (!cputimer_running(tsk)) - return; - - atomic64_add(cputime, &cputimer->cputime_atomic.stime); -} - -/** - * account_group_exec_runtime - Maintain exec runtime for a thread group. - * - * @tsk: Pointer to task structure. - * @ns: Time value by which to increment the sum_exec_runtime field - * of the thread_group_cputime structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the sum_exec_runtime field there. - */ -static inline void account_group_exec_runtime(struct task_struct *tsk, - unsigned long long ns) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - - if (!cputimer_running(tsk)) - return; - - atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); -} diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 604297a08b3a..9f69fb630853 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *stop = rq->stop; diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 82f0dff90030..3d5610dcce11 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -1,4 +1,4 @@ -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/swait.h> void __init_swait_queue_head(struct swait_queue_head *q, const char *name, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c new file mode 100644 index 000000000000..1b0b4fb12837 --- /dev/null +++ b/kernel/sched/topology.c @@ -0,0 +1,1658 @@ +/* + * Scheduler topology setup/handling methods + */ +#include <linux/sched.h> +#include <linux/mutex.h> + +#include "sched.h" + +DEFINE_MUTEX(sched_domains_mutex); + +/* Protected by sched_domains_mutex: */ +cpumask_var_t sched_domains_tmpmask; + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_debug_enabled; + +static int __init sched_debug_setup(char *str) +{ + sched_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + struct sched_group *group = sd->groups; + + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %*pbl level %s\n", + cpumask_pr_args(sched_domain_span(sd)), sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + + printk(KERN_DEBUG "%*s groups:", level + 1, ""); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!cpumask_weight(sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + break; + } + + if (!(sd->flags & SD_OVERLAP) && + cpumask_intersects(groupmask, sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } + + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + + printk(KERN_CONT " %*pbl", + cpumask_pr_args(sched_group_cpus(group))); + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { + printk(KERN_CONT " (cpu_capacity = %lu)", + group->sgc->capacity); + } + + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + if (!sched_debug_enabled) + return; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } +} +#else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 +# define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_AFFINE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; + } + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + + cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); + free_cpumask_var(rd->dlo_mask); + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * If we dont want to free the old_rd yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + goto free_online; + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_dlo_mask; + + init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_rto_mask; + + if (cpupri_init(&rd->cpupri) != 0) + goto free_cpudl; + return 0; + +free_cpudl: + cpudl_cleanup(&rd->cpudl); +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_dlo_mask: + free_cpumask_var(rd->dlo_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +/* + * By default the system creates a single root-domain with all CPUs as + * members (mimicking the global state we have today). + */ +struct root_domain def_root_domain; + +void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +static void free_sched_groups(struct sched_group *sg, int free_sgc) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + +static void destroy_sched_domain(struct sched_domain *sd) +{ + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. + */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgc); + kfree(sd->groups); + } + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + kfree(sd->shared); + kfree(sd); +} + +static void destroy_sched_domains_rcu(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + while (sd) { + struct sched_domain *parent = sd->parent; + destroy_sched_domain(sd); + sd = parent; + } +} + +static void destroy_sched_domains(struct sched_domain *sd) +{ + if (sd) + call_rcu(&sd->rcu, destroy_sched_domains_rcu); +} + +/* + * Keep a special pointer to the highest sched_domain that has + * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this + * allows us to avoid some pointer chasing select_idle_sibling(). + * + * Also keep a unique ID per domain (we use the first CPU number in + * the cpumask of the domain), this allows us to quickly tell if + * two CPUs are in the same cache domain, see cpus_share_cache(). + */ +DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_size); +DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_asym); + +static void update_top_cache_domain(int cpu) +{ + struct sched_domain_shared *sds = NULL; + struct sched_domain *sd; + int id = cpu; + int size = 1; + + sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); + if (sd) { + id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + sds = sd->shared; + } + + rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + + sd = highest_flag_domain(cpu, SD_ASYM_PACKING); + rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. + */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; + destroy_sched_domain(parent); + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + tmp = sd; + sd = sd->parent; + destroy_sched_domain(tmp); + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + tmp = rq->sd; + rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp); + + update_top_cache_domain(cpu); +} + +/* Setup the mask of CPUs configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ret; + + alloc_bootmem_cpumask_var(&cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } + return 1; +} +__setup("isolcpus=", isolated_cpu_setup); + +struct s_data { + struct sched_domain ** __percpu sd; + struct root_domain *rd; +}; + +enum s_alloc { + sa_rootdomain, + sa_sd, + sa_sd_storage, + sa_none, +}; + +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * CPU they're built on, so check that. + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sibling = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) + build_group_mask(sd, sg); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + /* + * Make sure the first group of this domain contains the + * canonical balance CPU. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + group_balance_cpu(sg) == cpu) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) +{ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; + + if (child) + cpu = cpumask_first(sched_domain_span(child)); + + if (sg) { + *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_set(&(*sg)->sgc->ref, 1); + } + + return cpu; +} + +/* + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_capacity to 0. + * + * Assumes the sched_domain tree is fully constructed + */ +static int +build_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; + int i; + + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(span)) + return 0; + + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct sched_group *sg; + int group, j; + + if (cpumask_test_cpu(i, covered)) + continue; + + group = get_group(i, sdd, &sg); + cpumask_setall(sched_group_mask(sg)); + + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) + continue; + + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; + + return 0; +} + +/* + * Initialize sched groups cpu_capacity. + * + * cpu_capacity indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. + */ +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) +{ + struct sched_group *sg = sd->groups; + + WARN_ON(!sg); + + do { + int cpu, max_cpu = -1; + + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + + if (!(sd->flags & SD_ASYM_PACKING)) + goto next; + + for_each_cpu(cpu, sched_group_cpus(sg)) { + if (max_cpu < 0) + max_cpu = cpu; + else if (sched_asym_prefer(cpu, max_cpu)) + max_cpu = cpu; + } + sg->asym_prefer_cpu = max_cpu; + +next: + sg = sg->next; + } while (sg != sd->groups); + + if (cpu != group_balance_cpu(sg)) + return; + + update_group_capacity(sd, cpu); +} + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* Turn off idle balance on this domain: */ + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } else { + /* Turn on idle balance on this domain: */ + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_rootdomain: + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); + /* Fall through */ + case sa_sd: + free_percpu(d->sd); + /* Fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); + /* Fall through */ + case sa_none: + break; + } +} + +static enum s_alloc +__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) +{ + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; + d->rd = alloc_rootdomain(); + if (!d->rd) + return sa_sd; + return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) + *per_cpu_ptr(sdd->sds, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) + *per_cpu_ptr(sdd->sg, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; +} + +#ifdef CONFIG_NUMA +static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; +static int *sched_domains_numa_distance; +int sched_max_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; +#endif + +/* + * SD_flags allowed in topology descriptions. + * + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ + SD_SHARE_POWERDOMAIN) + +static struct sched_domain * +sd_init(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, + struct sched_domain *child, int cpu) +{ + struct sd_data *sdd = &tl->data; + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + int sd_id, sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 1*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUCAPACITY + | 0*SD_SHARE_PKG_RESOURCES + | 0*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | 0*SD_NUMA + | sd_flags + , + + .last_balance = jiffies, + .balance_interval = sd_weight, + .smt_gain = 0, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, + .child = child, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif + }; + + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sched_domain_span(sd)); + + /* + * Convert topological properties into behaviour. + */ + + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + + if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + /* + * For all levels sharing cache; connect a sched_domain_shared + * instance. + */ + if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); + } + + sd->private = sdd; + + return sd; +} + +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + if (WARN_ON_ONCE(sched_smp_initialized)) + return; + + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (sched_domains_numa_levels <= 1) { + sched_numa_topology_type = NUMA_DIRECT; + return; + } + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + +void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; + } + + if (!level) + return; + + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_numa_distance[] array includes the actual distance + * numbers. + */ + + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * CPUs of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for_each_node(k) { + if (node_distance(j, k) > sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level + 1) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness. + */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, + .flags = SDTL_OVERLAP, + .numa_level = j, + SD_INIT_NAME(NUMA) + }; + } + + sched_domain_topology = tl; + + sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); +} + +void sched_domains_numa_masks_set(unsigned int cpu) +{ + int node = cpu_to_node(cpu); + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +void sched_domains_numa_masks_clear(unsigned int cpu) +{ + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +#endif /* CONFIG_NUMA */ + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sds = alloc_percpu(struct sched_domain_shared *); + if (!sdd->sds) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_domain_shared *sds; + struct sched_group *sg; + struct sched_group_capacity *sgc; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(sdd->sds, j) = sds; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + sg->next = sg; + + *per_cpu_ptr(sdd->sg, j) = sg; + + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sgc) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgc, j) = sgc; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sds) + kfree(*per_cpu_ptr(sdd->sds, j)); + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgc) + kfree(*per_cpu_ptr(sdd->sgc, j)); + } + free_percpu(sdd->sd); + sdd->sd = NULL; + free_percpu(sdd->sds); + sdd->sds = NULL; + free_percpu(sdd->sg); + sdd->sg = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; + } +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) +{ + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + child->parent = sd; + + if (!cpumask_subset(sched_domain_span(child), + sched_domain_span(sd))) { + pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" the %s domain not a subset of the %s domain\n", + child->name, sd->name); +#endif + /* Fixup, ensure @sd has at least @child cpus. */ + cpumask_or(sched_domain_span(sd), + sched_domain_span(sd), + sched_domain_span(child)); + } + + } + set_domain_attribute(sd, attr); + + return sd; +} + +/* + * Build sched domains for a given set of CPUs and attach the sched domains + * to the individual CPUs + */ +static int +build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) +{ + enum s_alloc alloc_state; + struct sched_domain *sd; + struct s_data d; + struct rq *rq = NULL; + int i, ret = -ENOMEM; + + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + + /* Set up domains for CPUs specified by the cpu_map: */ + for_each_cpu(i, cpu_map) { + struct sched_domain_topology_level *tl; + + sd = NULL; + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; + } + } + + /* Build the groups for the domains */ + for_each_cpu(i, cpu_map) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } + } + } + + /* Calculate CPU capacity for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + init_sched_groups_capacity(i, sd); + } + } + + /* Attach the domains */ + rcu_read_lock(); + for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); + sd = *per_cpu_ptr(d.sd, i); + + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ + if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + + cpu_attach_domain(sd, d.rd, i); + } + rcu_read_unlock(); + + if (rq && sched_debug_enabled) { + pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); + } + + ret = 0; +error: + __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; +} + +/* Current sched domains: */ +static cpumask_var_t *doms_cur; + +/* Number of sched domains in 'doms_cur': */ +static int ndoms_cur; + +/* Attribues of custom domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + +/* + * Special case: If a kmalloc() of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * CPU core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __weak arch_update_cpu_topology(void) +{ + return 0; +} + +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated CPUs, but could be used to + * exclude other special cases in the future. + */ +int init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = alloc_sched_domains(ndoms_cur); + if (!doms_cur) + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + err = build_sched_domains(doms_cur[0], NULL); + register_sched_domain_sysctl(); + + return err; +} + +/* + * Detach sched domains from a group of CPUs specified in cpu_map + * These CPUs will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + int i; + + rcu_read_lock(); + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + rcu_read_unlock(); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* Fast path: */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* Always unregister in case we don't destroy any domains: */ + unregister_sched_domain_sysctl(); + + /* Let the architecture update CPU core mappings: */ + new_topology = arch_update_cpu_topology(); + + n = doms_new ? ndoms_new : 0; + + /* Destroy deleted domains: */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* No match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur[i]); +match1: + ; + } + + n = ndoms_cur; + if (doms_new == NULL) { + n = 0; + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains: */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* No match - add a new doms_new */ + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains: */ + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); + + kfree(dattr_cur); + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 9453efe9b25a..b8c84c6dee64 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -5,7 +5,8 @@ */ #include <linux/init.h> #include <linux/export.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> @@ -241,6 +242,45 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_event); +/* + * Note! These two wait functions are entered with the + * wait-queue lock held (and interrupts off in the _irq + * case), so there is no race with testing the wakeup + * condition in the caller before they add the wait + * entry to the wake queue. + */ +int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +{ + if (likely(list_empty(&wait->task_list))) + __add_wait_queue_tail(wq, wait); + + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + return -ERESTARTSYS; + + spin_unlock(&wq->lock); + schedule(); + spin_lock(&wq->lock); + return 0; +} +EXPORT_SYMBOL(do_wait_intr); + +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +{ + if (likely(list_empty(&wait->task_list))) + __add_wait_queue_tail(wq, wait); + + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + return -ERESTARTSYS; + + spin_unlock_irq(&wq->lock); + schedule(); + spin_lock_irq(&wq->lock); + return 0; +} +EXPORT_SYMBOL(do_wait_intr_irq); + /** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f7ce79a46050..65f61077ad50 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -16,7 +16,9 @@ #include <linux/atomic.h> #include <linux/audit.h> #include <linux/compat.h> +#include <linux/coredump.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/seccomp.h> #include <linux/slab.h> #include <linux/syscalls.h> @@ -486,6 +488,17 @@ void put_seccomp_filter(struct task_struct *tsk) } } +static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGSYS; + info->si_code = SYS_SECCOMP; + info->si_call_addr = (void __user *)KSTK_EIP(current); + info->si_errno = reason; + info->si_arch = syscall_get_arch(); + info->si_syscall = syscall; +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland @@ -496,13 +509,7 @@ void put_seccomp_filter(struct task_struct *tsk) static void seccomp_send_sigsys(int syscall, int reason) { struct siginfo info; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSYS; - info.si_code = SYS_SECCOMP; - info.si_call_addr = (void __user *)KSTK_EIP(current); - info.si_errno = reason; - info.si_arch = syscall_get_arch(); - info.si_syscall = syscall; + seccomp_init_siginfo(&info, syscall, reason); force_sig_info(SIGSYS, &info, current); } #endif /* CONFIG_SECCOMP_FILTER */ @@ -634,10 +641,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL: - default: + default: { + siginfo_t info; audit_seccomp(this_syscall, SIGSYS, action); + /* Dump core only if this is the last remaining thread. */ + if (get_nr_threads(current) == 1) { + /* Show the original registers in the dump. */ + syscall_rollback(current, task_pt_regs(current)); + /* Trigger a manual coredump since do_exit skips it. */ + seccomp_init_siginfo(&info, this_syscall, data); + do_coredump(&info); + } do_exit(SIGSYS); } + } unreachable(); diff --git a/kernel/signal.c b/kernel/signal.c index 3603d93a1968..ca92bcfeb322 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -13,7 +13,12 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/init.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/user.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> #include <linux/fs.h> #include <linux/tty.h> #include <linux/binfmts.h> @@ -1232,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } /* * This sighand can be already freed and even reused, but - * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * @@ -1313,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) } } -int kill_proc_info(int sig, struct siginfo *info, pid_t pid) +static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1581,7 +1586,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) unsigned long flags; struct sighand_struct *psig; bool autoreap = false; - cputime_t utime, stime; + u64 utime, stime; BUG_ON(sig == -1); @@ -1620,8 +1625,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) rcu_read_unlock(); task_cputime(tsk, &utime, &stime); - info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); - info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); + info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime); + info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1685,7 +1690,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; - cputime_t utime, stime; + u64 utime, stime; if (for_ptracer) { parent = tsk->parent; @@ -1705,8 +1710,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, rcu_read_unlock(); task_cputime(tsk, &utime, &stime); - info.si_utime = cputime_to_clock_t(utime); - info.si_stime = cputime_to_clock_t(stime); + info.si_utime = nsec_to_clock_t(utime); + info.si_stime = nsec_to_clock_t(stime); info.si_code = why; switch (why) { @@ -2395,11 +2400,11 @@ void exit_signals(struct task_struct *tsk) * @tsk is about to have PF_EXITING set - lock out users which * expect stable threadgroup. */ - threadgroup_change_begin(tsk); + cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); return; } @@ -2410,7 +2415,7 @@ void exit_signals(struct task_struct *tsk) */ tsk->flags |= PF_EXITING; - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); if (!signal_pending(tsk)) goto out; @@ -3239,10 +3244,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss) int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { + int err; struct task_struct *t = current; - return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), + &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #endif diff --git a/kernel/smp.c b/kernel/smp.c index 77fcdb9f2775..a817769b53c0 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -17,6 +17,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/sched/idle.h> #include <linux/hypervisor.h> #include "smpboot.h" diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 4a5c6e73ecd4..1d71c051a951 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -9,6 +9,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/kthread.h> diff --git a/kernel/softirq.c b/kernel/softirq.c index 744fa611cae0..4e09821f9d9e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -309,7 +309,7 @@ restart: account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); - tsk_restore_flags(current, old_flags, PF_MEMALLOC); + current_restore_flags(old_flags, PF_MEMALLOC); } asmlinkage __visible void do_softirq(void) diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9c15a9124e83..f8edee9c792d 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -54,8 +54,8 @@ int snprint_stack_trace(char *buf, size_t size, EXPORT_SYMBOL_GPL(snprint_stack_trace); /* - * Architectures that do not implement save_stack_trace_tsk or - * save_stack_trace_regs get this weak alias and a once-per-bootup warning + * Architectures that do not implement save_stack_trace_*() + * get these weak aliases and once-per-bootup warnings * (whenever this facility is utilized - for example by procfs): */ __weak void @@ -69,3 +69,11 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); } + +__weak int +save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); + return -ENOSYS; +} diff --git a/kernel/sys.c b/kernel/sys.c index 842914ef7de4..8a94b4eabcaa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -49,6 +49,13 @@ #include <linux/binfmts.h> #include <linux/sched.h> +#include <linux/sched/autogroup.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/stat.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/task.h> +#include <linux/sched/cputime.h> #include <linux/rcupdate.h> #include <linux/uidgid.h> #include <linux/cred.h> @@ -881,15 +888,15 @@ SYSCALL_DEFINE0(getegid) void do_sys_times(struct tms *tms) { - cputime_t tgutime, tgstime, cutime, cstime; + u64 tgutime, tgstime, cutime, cstime; thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - tms->tms_utime = cputime_to_clock_t(tgutime); - tms->tms_stime = cputime_to_clock_t(tgstime); - tms->tms_cutime = cputime_to_clock_t(cutime); - tms->tms_cstime = cputime_to_clock_t(cstime); + tms->tms_utime = nsec_to_clock_t(tgutime); + tms->tms_stime = nsec_to_clock_t(tgstime); + tms->tms_cutime = nsec_to_clock_t(cutime); + tms->tms_cstime = nsec_to_clock_t(cstime); } SYSCALL_DEFINE1(times, struct tms __user *, tbuf) @@ -1389,8 +1396,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, !capable(CAP_SYS_RESOURCE)) retval = -EPERM; if (!retval) - retval = security_task_setrlimit(tsk->group_leader, - resource, new_rlim); + retval = security_task_setrlimit(tsk, resource, new_rlim); if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { /* * The caller is asking for an immediate RLIMIT_CPU @@ -1425,25 +1431,26 @@ out: } /* rcu lock must be held */ -static int check_prlimit_permission(struct task_struct *task) +static int check_prlimit_permission(struct task_struct *task, + unsigned int flags) { const struct cred *cred = current_cred(), *tcred; + bool id_match; if (current == task) return 0; tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) - return 0; - if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) - return 0; + id_match = (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)); + if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) + return -EPERM; - return -EPERM; + return security_task_prlimit(cred, tcred, flags); } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, @@ -1453,12 +1460,17 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit64 old64, new64; struct rlimit old, new; struct task_struct *tsk; + unsigned int checkflags = 0; int ret; + if (old_rlim) + checkflags |= LSM_PRLIMIT_READ; + if (new_rlim) { if (copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT; rlim64_to_rlim(&new64, &new); + checkflags |= LSM_PRLIMIT_WRITE; } rcu_read_lock(); @@ -1467,7 +1479,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, rcu_read_unlock(); return -ESRCH; } - ret = check_prlimit_permission(tsk); + ret = check_prlimit_permission(tsk, checkflags); if (ret) { rcu_read_unlock(); return ret; @@ -1544,7 +1556,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - cputime_t tgutime, tgstime, utime, stime; + u64 tgutime, tgstime, utime, stime; unsigned long maxrss = 0; memset((char *)r, 0, sizeof (*r)); @@ -1600,8 +1612,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unlock_task_sighand(p, &flags); out: - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); + r->ru_utime = ns_to_timeval(utime); + r->ru_stime = ns_to_timeval(stime); if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); @@ -2063,6 +2075,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) } #endif +static int propagate_has_child_subreaper(struct task_struct *p, void *data) +{ + /* + * If task has has_child_subreaper - all its decendants + * already have these flag too and new decendants will + * inherit it on fork, skip them. + * + * If we've found child_reaper - skip descendants in + * it's subtree as they will never get out pidns. + */ + if (p->signal->has_child_subreaper || + is_child_reaper(task_pid(p))) + return 0; + + p->signal->has_child_subreaper = 1; + return 1; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2214,6 +2244,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; + if (!arg2) + break; + + walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1aea594a54db..4dfba1a76cc3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -63,6 +63,7 @@ #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> +#include <linux/sched/coredump.h> #include <linux/kexec.h> #include <linux/bpf.h> #include <linux/mount.h> @@ -416,7 +417,7 @@ static struct ctl_table kern_table[] = { }, { .procname = "sched_rr_timeslice_ms", - .data = &sched_rr_timeslice, + .data = &sysctl_sched_rr_timeslice, .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rr_handler, @@ -1175,6 +1176,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_BPF_SYSCALL @@ -2132,9 +2135,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, if (write) { if (*negp) return -EINVAL; + if (*lvalp > UINT_MAX) + return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; + *negp = false; *lvalp = (unsigned long)val; } return 0; @@ -2570,7 +2576,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > LONG_MAX / HZ) + if (*lvalp > INT_MAX / HZ) return 1; *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); } else { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8a5e44236f78..4559e914452b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -30,6 +30,7 @@ #include <linux/pid_namespace.h> #include <net/genetlink.h> #include <linux/atomic.h> +#include <linux/sched/cputime.h> /* * Maximum length of a cpumask that can be specified in @@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; + u64 delta, utime, stime; + u64 start_time; /* * Add additional stats from live tasks except zombie thread group @@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) memset(stats, 0, sizeof(*stats)); tsk = first; + start_time = ktime_get_ns(); do { if (tsk->exit_state) continue; @@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) */ delayacct_add_tsk(stats, tsk); + /* calculate task elapsed time in nsec */ + delta = start_time - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime += delta; + + task_cputime(tsk, &utime, &stime); + stats->ac_utime += div_u64(utime, NSEC_PER_USEC); + stats->ac_stime += div_u64(stime, NSEC_PER_USEC); + stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 976840d29a71..938dbf33ef49 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -15,6 +15,5 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o -obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e6dc9a538efa..5cb5b0008d97 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -19,6 +19,8 @@ #include <linux/hrtimer.h> #include <linux/timerqueue.h> #include <linux/rtc.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> #include <linux/alarmtimer.h> #include <linux/mutex.h> #include <linux/platform_device.h> @@ -539,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, * * Returns the granularity of underlying alarm base clock */ -static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { if (!alarmtimer_get_rtcdev()) return -EINVAL; @@ -556,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) * * Provides the underlying alarm base time. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec(base->gettime()); + *tp = ktime_to_timespec64(base->gettime()); return 0; } @@ -596,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer) * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec *cur_setting) + struct itimerspec64 *cur_setting) { ktime_t relative_expiry_time = alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); if (ktime_to_ns(relative_expiry_time) > 0) { - cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); } else { cur_setting->it_value.tv_sec = 0; cur_setting->it_value.tv_nsec = 0; } - cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); + cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); } /** @@ -638,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr) * Sets the timer to new_setting, and starts the timer. */ static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, - struct itimerspec *old_setting) + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { ktime_t exp; @@ -657,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; /* start the timer */ - timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); - exp = timespec_to_ktime(new_setting->it_value); + timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now; @@ -788,13 +790,14 @@ out: * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsreq, struct timespec __user *rmtp) + struct timespec64 *tsreq, + struct timespec __user *rmtp) { enum alarmtimer_type type = clock2alarm(which_clock); + struct restart_block *restart; struct alarm alarm; ktime_t exp; int ret = 0; - struct restart_block *restart; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -807,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - exp = timespec_to_ktime(*tsreq); + exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].gettime(); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 97ac0951f164..4237e0744e26 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -void clockevents_config(struct clock_event_device *dev, u32 freq) +static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 665985b0a89a..93621ae718d3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + + if (cs->mark_unstable) + cs->mark_unstable(cs); + if (finished_booting) schedule_work(&watchdog_work); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c6ecedd3b839..ac053bb5296e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -43,10 +43,12 @@ #include <linux/seq_file.h> #include <linux/err.h> #include <linux/debugobjects.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> #include <linux/timer.h> #include <linux/freezer.h> @@ -94,17 +96,15 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; -static inline int hrtimer_clockid_to_base(clockid_t clock_id) -{ - return hrtimer_clock_to_base_table[clock_id]; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -766,34 +766,6 @@ void hrtimers_resume(void) clock_was_set_delayed(); } -static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (timer->start_site) - return; - timer->start_site = __builtin_return_address(0); - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -#endif -} - -static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; -#endif -} - -static inline void timer_stats_account_hrtimer(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (likely(!timer_stats_active)) - return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, 0); -#endif -} - /* * Counterpart to lock_hrtimer_base above: */ @@ -932,7 +904,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * rare case and less expensive than a smp call. */ debug_deactivate(timer); - timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); if (!restart) @@ -990,8 +961,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - timer_stats_hrtimer_set_start_info(timer); - leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) goto unlock; @@ -1018,7 +987,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * Returns: * 0 when the timer was not active * 1 when the timer was active - * -1 when the timer is currently excuting the callback function and + * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) @@ -1112,6 +1081,18 @@ u64 hrtimer_get_next_event(void) } #endif +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + if (likely(clock_id < MAX_CLOCKS)) { + int base = hrtimer_clock_to_base_table[clock_id]; + + if (likely(base != HRTIMER_MAX_CLOCK_BASES)) + return base; + } + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; +} + static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { @@ -1128,12 +1109,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); - -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif } /** @@ -1217,7 +1192,6 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&cpu_base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - timer_stats_account_hrtimer(timer); fn = timer->function; /* @@ -1394,10 +1368,7 @@ retry: ktime_to_ns(delta)); } -/* - * local version of hrtimer_peek_ahead_timers() called with interrupts - * disabled. - */ +/* called with interrupts disabled */ static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1532,7 +1503,7 @@ out: return ret; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, +long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; @@ -1545,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); if (do_nanosleep(&t, mode)) goto out; @@ -1576,15 +1547,17 @@ out: SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 tu64; struct timespec tu; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8c89143f9ebf..087d6a1279b8 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -10,6 +10,8 @@ #include <linux/interrupt.h> #include <linux/syscalls.h> #include <linux/time.h> +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> #include <linux/posix-timers.h> #include <linux/hrtimer.h> #include <trace/events/timer.h> @@ -45,16 +47,16 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, struct itimerval *const value) { - cputime_t cval, cinterval; + u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; spin_lock_irq(&tsk->sighand->siglock); - cval = it->expires; - cinterval = it->incr; - if (cval) { + val = it->expires; + interval = it->incr; + if (val) { struct task_cputime cputime; - cputime_t t; + u64 t; thread_group_cputimer(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) @@ -63,17 +65,17 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, /* CPUCLOCK_VIRT */ t = cputime.utime; - if (cval < t) + if (val < t) /* about to fire */ - cval = cputime_one_jiffy; + val = TICK_NSEC; else - cval = cval - t; + val -= t; } spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + value->it_value = ns_to_timeval(val); + value->it_interval = ns_to_timeval(interval); } int do_getitimer(int which, struct itimerval *value) @@ -129,55 +131,35 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) -{ - struct timespec ts; - s64 cpu_ns; - - cputime_to_timespec(ct, &ts); - cpu_ns = timespec_to_ns(&ts); - - return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; -} - static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, const struct itimerval *const value, struct itimerval *const ovalue) { - cputime_t cval, nval, cinterval, ninterval; - s64 ns_ninterval, ns_nval; - u32 error, incr_error; + u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - nval = timeval_to_cputime(&value->it_value); - ns_nval = timeval_to_ns(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - ns_ninterval = timeval_to_ns(&value->it_interval); - - error = cputime_sub_ns(nval, ns_nval); - incr_error = cputime_sub_ns(ninterval, ns_ninterval); + nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); - cval = it->expires; - cinterval = it->incr; - if (cval || nval) { + oval = it->expires; + ointerval = it->incr; + if (oval || nval) { if (nval > 0) - nval += cputime_one_jiffy; - set_process_cpu_timer(tsk, clock_id, &nval, &cval); + nval += TICK_NSEC; + set_process_cpu_timer(tsk, clock_id, &nval, &oval); } it->expires = nval; it->incr = ninterval; - it->error = error; - it->incr_error = incr_error; trace_itimer_state(clock_id == CPUCLOCK_VIRT ? ITIMER_VIRTUAL : ITIMER_PROF, value, nval); spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); + ovalue->it_value = ns_to_timeval(oval); + ovalue->it_interval = ns_to_timeval(ointerval); } } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a4a0e478e44d..497719127bf9 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -27,19 +27,8 @@ #include "timekeeping.h" -/* The Jiffies based clocksource is the lowest common - * denominator clock source which should function on - * all systems. It has the same coarse resolution as - * the timer interrupt frequency HZ and it suffers - * inaccuracies caused by missed or lost timer - * interrupts and the inability for the timer - * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not recommended - * for "tick-less" systems. - */ -#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) -/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier +/* Since jiffies uses a simple TICK_NSEC multiplier * conversion, the .shift value could be zero. However * this would make NTP adjustments impossible as they are * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to @@ -47,8 +36,8 @@ * amount, and give ntp adjustments in units of 1/2^8 * * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater than 8 overflow 32bits when + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ #if HZ < 34 @@ -64,12 +53,23 @@ static u64 jiffies_read(struct clocksource *cs) return (u64) jiffies; } +/* + * The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, .mask = CLOCKSOURCE_MASK(32), - .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, }; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 9cff0ab82b63..31d588d37a17 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -297,7 +297,7 @@ out: return err; } -static int pc_clock_gettime(clockid_t id, struct timespec *ts) +static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts) return err; } -static int pc_clock_getres(clockid_t id, struct timespec *ts) +static int pc_clock_getres(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts) return err; } -static int pc_clock_settime(clockid_t id, const struct timespec *ts) +static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit) return err; } -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; @@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) } static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec *ts, struct itimerspec *old) + struct itimerspec64 *ts, struct itimerspec64 *old) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e9e8c10f0d9a..1370f067fb51 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -2,7 +2,8 @@ * Implement CPU time clocks for the POSIX clock interface. */ -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> #include <linux/posix-timers.h> #include <linux/errno.h> #include <linux/math64.h> @@ -20,10 +21,10 @@ */ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { - cputime_t cputime = secs_to_cputime(rlim_new); + u64 nsecs = rlim_new * NSEC_PER_SEC; spin_lock_irq(&task->sighand->siglock); - set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); spin_unlock_irq(&task->sighand->siglock); } @@ -50,39 +51,14 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline unsigned long long -timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) -{ - unsigned long long ret; - - ret = 0; /* high half always zero when .cpu used */ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; - } else { - ret = cputime_to_expires(timespec_to_cputime(tp)); - } - return ret; -} - -static void sample_to_timespec(const clockid_t which_clock, - unsigned long long expires, - struct timespec *tp) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(expires); - else - cputime_to_timespec((__force cputime_t)expires, tp); -} - /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ -static void bump_cpu_timer(struct k_itimer *timer, - unsigned long long now) +static void bump_cpu_timer(struct k_itimer *timer, u64 now) { int i; - unsigned long long delta, incr; + u64 delta, incr; if (timer->it.cpu.incr == 0) return; @@ -122,25 +98,25 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline unsigned long long prof_ticks(struct task_struct *p) +static inline u64 prof_ticks(struct task_struct *p) { - cputime_t utime, stime; + u64 utime, stime; task_cputime(p, &utime, &stime); - return cputime_to_expires(utime + stime); + return utime + stime; } -static inline unsigned long long virt_ticks(struct task_struct *p) +static inline u64 virt_ticks(struct task_struct *p) { - cputime_t utime, stime; + u64 utime, stime; task_cputime(p, &utime, &stime); - return cputime_to_expires(utime); + return utime; } static int -posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { int error = check_clock(which_clock); if (!error) { @@ -159,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) } static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -176,8 +152,8 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) /* * Sample a per-thread clock for the given task. */ -static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - unsigned long long *sample) +static int cpu_clock_sample(const clockid_t which_clock, + struct task_struct *p, u64 *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: @@ -260,7 +236,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - unsigned long long *sample) + u64 *sample) { struct task_cputime cputime; @@ -269,11 +245,11 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - *sample = cputime_to_expires(cputime.utime + cputime.stime); + *sample = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - *sample = cputime_to_expires(cputime.utime); + *sample = cputime.utime; break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); @@ -285,10 +261,10 @@ static int cpu_clock_sample_group(const clockid_t which_clock, static int posix_cpu_clock_get_task(struct task_struct *tsk, const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { int err = -EINVAL; - unsigned long long rtn; + u64 rtn; if (CPUCLOCK_PERTHREAD(which_clock)) { if (same_thread_group(tsk, current)) @@ -299,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, } if (!err) - sample_to_timespec(which_clock, rtn, tp); + *tp = ns_to_timespec64(rtn); return err; } -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int err = -EINVAL; @@ -453,7 +429,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) cleanup_timers(tsk->signal->cpu_timers); } -static inline int expires_gt(cputime_t expires, cputime_t new_exp) +static inline int expires_gt(u64 expires, u64 new_exp) { return expires == 0 || expires > new_exp; } @@ -488,7 +464,7 @@ static void arm_timer(struct k_itimer *timer) list_add(&nt->entry, listpos); if (listpos == head) { - unsigned long long exp = nt->expires; + u64 exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -499,16 +475,15 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) - cputime_expires->prof_exp = expires_to_cputime(exp); + if (expires_gt(cputime_expires->prof_exp, exp)) + cputime_expires->prof_exp = exp; break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) - cputime_expires->virt_exp = expires_to_cputime(exp); + if (expires_gt(cputime_expires->virt_exp, exp)) + cputime_expires->virt_exp = exp; break; case CPUCLOCK_SCHED: - if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp) + if (expires_gt(cputime_expires->sched_exp, exp)) cputime_expires->sched_exp = exp; break; } @@ -559,8 +534,7 @@ static void cpu_timer_fire(struct k_itimer *timer) * traversal. */ static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, - unsigned long long *sample) + struct task_struct *p, u64 *sample) { struct task_cputime cputime; @@ -569,10 +543,10 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - *sample = cputime_to_expires(cputime.utime + cputime.stime); + *sample = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: - *sample = cputime_to_expires(cputime.utime); + *sample = cputime.utime; break; case CPUCLOCK_SCHED: *sample = cputime.sum_exec_runtime; @@ -588,17 +562,17 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * and try again. (This happens when the timer is in the middle of firing.) */ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, - struct itimerspec *new, struct itimerspec *old) + struct itimerspec64 *new, struct itimerspec64 *old) { unsigned long flags; struct sighand_struct *sighand; struct task_struct *p = timer->it.cpu.task; - unsigned long long old_expires, new_expires, old_incr, val; + u64 old_expires, new_expires, old_incr, val; int ret; WARN_ON_ONCE(p == NULL); - new_expires = timespec_to_sample(timer->it_clock, &new->it_value); + new_expires = timespec64_to_ns(&new->it_value); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers @@ -659,9 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, bump_cpu_timer(timer, val); if (val < timer->it.cpu.expires) { old_expires = timer->it.cpu.expires - val; - sample_to_timespec(timer->it_clock, - old_expires, - &old->it_value); + old->it_value = ns_to_timespec64(old_expires); } else { old->it_value.tv_nsec = 1; old->it_value.tv_sec = 0; @@ -699,8 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. */ - timer->it.cpu.incr = timespec_to_sample(timer->it_clock, - &new->it_interval); + timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -723,17 +694,15 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: - if (old) { - sample_to_timespec(timer->it_clock, - old_incr, &old->it_interval); - } + if (old) + old->it_interval = ns_to_timespec64(old_incr); return ret; } -static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { - unsigned long long now; + u64 now; struct task_struct *p = timer->it.cpu.task; WARN_ON_ONCE(p == NULL); @@ -741,8 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) /* * Easy part: convert the reload time. */ - sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &itp->it_interval); + itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; @@ -771,8 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) * Call the timer disarmed, nothing else to do. */ timer->it.cpu.expires = 0; - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); @@ -781,9 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } if (now < timer->it.cpu.expires) { - sample_to_timespec(timer->it_clock, - timer->it.cpu.expires - now, - &itp->it_value); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); } else { /* * The timer should have expired already, but the firing @@ -827,7 +792,7 @@ static void check_thread_timers(struct task_struct *tsk, struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; struct task_cputime *tsk_expires = &tsk->cputime_expires; - unsigned long long expires; + u64 expires; unsigned long soft; /* @@ -838,10 +803,10 @@ static void check_thread_timers(struct task_struct *tsk, return; expires = check_timers_list(timers, firing, prof_ticks(tsk)); - tsk_expires->prof_exp = expires_to_cputime(expires); + tsk_expires->prof_exp = expires; expires = check_timers_list(++timers, firing, virt_ticks(tsk)); - tsk_expires->virt_exp = expires_to_cputime(expires); + tsk_expires->virt_exp = expires; tsk_expires->sched_exp = check_timers_list(++timers, firing, tsk->se.sum_exec_runtime); @@ -860,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -871,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk, soft += USEC_PER_SEC; sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } - printk(KERN_INFO - "RT Watchdog Timeout: %s[%d]\n", + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } @@ -890,26 +856,17 @@ static inline void stop_process_timers(struct signal_struct *sig) tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } -static u32 onecputick; - static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - unsigned long long *expires, - unsigned long long cur_time, int signo) + u64 *expires, u64 cur_time, int signo) { if (!it->expires) return; if (cur_time >= it->expires) { - if (it->incr) { + if (it->incr) it->expires += it->incr; - it->error += it->incr_error; - if (it->error >= onecputick) { - it->expires -= cputime_one_jiffy; - it->error -= onecputick; - } - } else { + else it->expires = 0; - } trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, @@ -917,9 +874,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (it->expires && (!*expires || it->expires < *expires)) { + if (it->expires && (!*expires || it->expires < *expires)) *expires = it->expires; - } } /* @@ -931,8 +887,8 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; - unsigned long long utime, ptime, virt_expires, prof_expires; - unsigned long long sum_sched_runtime, sched_expires; + u64 utime, ptime, virt_expires, prof_expires; + u64 sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; unsigned long soft; @@ -954,8 +910,8 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); - utime = cputime_to_expires(cputime.utime); - ptime = utime + cputime_to_expires(cputime.stime); + utime = cputime.utime; + ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; prof_expires = check_timers_list(timers, firing, ptime); @@ -971,15 +927,17 @@ static void check_process_timers(struct task_struct *tsk, SIGVTALRM); soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (soft != RLIM_INFINITY) { - unsigned long psecs = cputime_to_secs(ptime); + unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); unsigned long hard = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); - cputime_t x; + u64 x; if (psecs >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. */ + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -987,20 +945,21 @@ static void check_process_timers(struct task_struct *tsk, /* * At the soft limit, send a SIGXCPU every second. */ + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { soft++; sig->rlim[RLIMIT_CPU].rlim_cur = soft; } } - x = secs_to_cputime(soft); - if (!prof_expires || x < prof_expires) { + x = soft * NSEC_PER_SEC; + if (!prof_expires || x < prof_expires) prof_expires = x; - } } - sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); - sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1017,7 +976,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) struct sighand_struct *sighand; unsigned long flags; struct task_struct *p = timer->it.cpu.task; - unsigned long long now; + u64 now; WARN_ON_ONCE(p == NULL); @@ -1214,9 +1173,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) * The tsk->sighand->siglock must be held by the caller. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, - cputime_t *newval, cputime_t *oldval) + u64 *newval, u64 *oldval) { - unsigned long long now; + u64 now; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1230,7 +1189,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (*oldval) { if (*oldval <= now) { /* Just about to fire. */ - *oldval = cputime_one_jiffy; + *oldval = TICK_NSEC; } else { *oldval -= now; } @@ -1260,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct itimerspec *it) + struct timespec64 *rqtp, struct itimerspec64 *it) { struct k_itimer timer; int error; @@ -1275,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, error = posix_cpu_timer_create(&timer); timer.it_process = current; if (!error) { - static struct itimerspec zero_it; + static struct itimerspec64 zero_it; memset(it, 0, sizeof *it); it->it_value = *rqtp; @@ -1310,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); + *rqtp = ns_to_timespec64(timer.it.cpu.expires); error = posix_cpu_timer_set(&timer, 0, &zero_it, it); if (!error) { /* @@ -1347,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) + struct timespec64 *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t->restart_block; - struct itimerspec it; + struct itimerspec64 it; + struct timespec ts; int error; /* @@ -1358,7 +1318,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, */ if (CPUCLOCK_PERTHREAD(which_clock) && (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) + CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) return -EINVAL; error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); @@ -1370,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, /* * Report back to the user the time still remaining. */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + ts = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec_to_ns(rqtp); + restart_block->nanosleep.expires = timespec64_to_ns(rqtp); } return error; } @@ -1384,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct timespec t; - struct itimerspec it; + struct itimerspec64 it; + struct timespec64 t; + struct timespec tmp; int error; - t = ns_to_timespec(restart_block->nanosleep.expires); + t = ns_to_timespec64(restart_block->nanosleep.expires); error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); @@ -1397,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) /* * Report back to the user the time still remaining. */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + tmp = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) return -EFAULT; - restart_block->nanosleep.expires = timespec_to_ns(&t); + restart_block->nanosleep.expires = timespec64_to_ns(&t); } return error; @@ -1410,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) static int process_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_getres(PROCESS_CLOCK, tp); } static int process_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(PROCESS_CLOCK, tp); } @@ -1425,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, + struct timespec64 *rqtp, struct timespec __user *rmtp) { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); @@ -1435,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block) return -EINVAL; } static int thread_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_getres(THREAD_CLOCK, tp); } static int thread_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(THREAD_CLOCK, tp); } @@ -1476,15 +1439,10 @@ static __init int init_posix_cpu_timers(void) .clock_get = thread_cpu_clock_get, .timer_create = thread_cpu_timer_create, }; - struct timespec ts; posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); - cputime_to_timespec(cputime_one_jiffy, &ts); - onecputick = ts.tv_nsec; - WARN_ON(ts.tv_sec != 0); - return 0; } __initcall(init_posix_cpu_timers); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index cd6716e115e8..c0cd53eb018a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -49,26 +49,32 @@ SYS_NI(alarm); SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct timespec64 new_tp64; struct timespec new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return do_sys_settimeofday(&new_tp, NULL); + + new_tp64 = timespec_to_timespec64(new_tp); + return do_sys_settimeofday64(&new_tp64, NULL); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct timespec64 kernel_tp64; struct timespec kernel_tp; switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; - case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; - case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; + case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; default: return -EINVAL; } + + kernel_tp = timespec64_to_timespec(kernel_tp64); if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) return -EFAULT; return 0; @@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 t64; struct timespec t; switch (which_clock) { @@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, case CLOCK_BOOTTIME: if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); default: diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1e6623d76750..4d7b2ce09c27 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -35,6 +35,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/mutex.h> +#include <linux/sched/task.h> #include <linux/uaccess.h> #include <linux/list.h> @@ -129,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; /* * These ones are defined below. */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, +static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, struct timespec __user *rmtp); static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec *); +static void common_timer_get(struct k_itimer *, struct itimerspec64 *); static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); + struct itimerspec64 *, struct itimerspec64 *); static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); @@ -203,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_real_ts(tp); + ktime_get_real_ts64(tp); return 0; } /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, - const struct timespec *tp) + const struct timespec64 *tp) { - return do_sys_settimeofday(tp, NULL); + return do_sys_settimeofday64(tp, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, @@ -225,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_ts(tp); + ktime_get_ts64(tp); return 0; } /* * Get monotonic-raw time for posix timers */ -static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { - getrawmonotonic(tp); + getrawmonotonic64(tp); return 0; } -static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { - *tp = current_kernel_time(); + *tp = current_kernel_time64(); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { - *tp = get_monotonic_coarse(); + *tp = get_monotonic_coarse64(); return 0; } -static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) { - *tp = ktime_to_timespec(KTIME_LOW_RES); + *tp = ktime_to_timespec64(KTIME_LOW_RES); return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - get_monotonic_boottime(tp); + get_monotonic_boottime64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai(tp); + timekeeping_clocktai64(tp); return 0; } -static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; @@ -733,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * report. */ static void -common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { ktime_t now, remaining, iv; struct hrtimer *timer = &timr->it.real.timer; - memset(cur_setting, 0, sizeof(struct itimerspec)); + memset(cur_setting, 0, sizeof(*cur_setting)); iv = timr->it.real.interval; /* interval timer ? */ if (iv) - cur_setting->it_interval = ktime_to_timespec(iv); + cur_setting->it_interval = ktime_to_timespec64(iv); else if (!hrtimer_active(timer) && (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) return; @@ -770,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) cur_setting->it_value.tv_nsec = 1; } else - cur_setting->it_value = ktime_to_timespec(remaining); + cur_setting->it_value = ktime_to_timespec64(remaining); } /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { + struct itimerspec64 cur_setting64; struct itimerspec cur_setting; struct k_itimer *timr; struct k_clock *kc; @@ -791,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else - kc->timer_get(timr, &cur_setting); + kc->timer_get(timr, &cur_setting64); unlock_timer(timr, flags); + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; @@ -830,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) /* timr->it_lock is taken. */ static int common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, struct itimerspec *old_setting) + struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; @@ -859,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; - hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); + hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); /* Convert interval */ - timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { @@ -882,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct itimerspec __user *, new_setting, struct itimerspec __user *, old_setting) { - struct k_itimer *timr; + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; struct itimerspec new_spec, old_spec; - int error = 0; + struct k_itimer *timr; unsigned long flag; - struct itimerspec *rtn = old_setting ? &old_spec : NULL; struct k_clock *kc; + int error = 0; if (!new_setting) return -EINVAL; if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); - if (!timespec_valid(&new_spec.it_interval) || - !timespec_valid(&new_spec.it_value)) + if (!timespec64_valid(&new_spec64.it_interval) || + !timespec64_valid(&new_spec64.it_value)) return -EINVAL; retry: timr = lock_timer(timer_id, &flag); @@ -907,7 +912,7 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, &new_spec, rtn); + error = kc->timer_set(timr, flags, &new_spec64, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { @@ -915,6 +920,7 @@ retry: goto retry; } + old_spec = itimerspec64_to_itimerspec(&old_spec64); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) error = -EFAULT; @@ -1013,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp64; struct timespec new_tp; if (!kc || !kc->clock_set) @@ -1020,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; + new_tp64 = timespec_to_timespec64(new_tp); - return kc->clock_set(which_clock, &new_tp); + return kc->clock_set(which_clock, &new_tp64); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get(which_clock, &kernel_tp64); + kernel_tp = timespec64_to_timespec(kernel_tp64); if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; @@ -1069,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp); + error = kc->clock_getres(which_clock, &rtn_tp64); + rtn_tp = timespec64_to_timespec(rtn_tp64); if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; @@ -1087,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) + struct timespec64 *tsave, struct timespec __user *rmtp) { return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, @@ -1099,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct timespec __user *, rmtp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t64; struct timespec t; if (!kc) @@ -1109,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return kc->nsleep(which_clock, flags, &t, rmtp); + return kc->nsleep(which_clock, flags, &t64, rmtp); } /* diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index a26036d37a38..2d8f05aad442 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/hrtimer.h> #include <linux/sched_clock.h> @@ -205,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) update_clock_read_data(&rd); + if (sched_clock_timer.function != NULL) { + /* update timeout for clock wrap */ + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + } + r = rate; if (r >= 4000000) { r /= 1000000; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 17ac99b60ee5..987e496bb51a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -29,12 +29,13 @@ */ static struct tick_device tick_broadcast_device; -static cpumask_var_t tick_broadcast_mask; -static cpumask_var_t tick_broadcast_on; -static cpumask_var_t tmpmask; -static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); +static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly; +static cpumask_var_t tmpmask __cpumask_var_read_mostly; static int tick_broadcast_forced; +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); + #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); @@ -516,9 +517,9 @@ void tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -static cpumask_var_t tick_broadcast_oneshot_mask; -static cpumask_var_t tick_broadcast_pending_mask; -static cpumask_var_t tick_broadcast_force_mask; +static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly; /* * Exposed for debugging: see timer_list.c diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2c115fdab397..64c97fc130c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -17,8 +17,12 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> +#include <linux/nmi.h> #include <linux/profile.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/clock.h> +#include <linux/sched/stat.h> +#include <linux/sched/nohz.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> @@ -989,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/time/time.c b/kernel/time/time.c index a3a9a8a029dc..49c73c6ed648 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) @@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - /* * Convert jiffies to milliseconds and back. * @@ -702,6 +688,16 @@ u64 nsec_to_clock_t(u64 x) #endif } +u64 jiffies64_to_nsecs(u64 j) +{ +#if !(NSEC_PER_SEC % HZ) + return (NSEC_PER_SEC / HZ) * j; +# else + return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_nsecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index c48688904f9f..f83bbb81600b 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -98,6 +98,12 @@ define timeconst(hz) { print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + + cd=gcd(hz,1000000000) + print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n" + print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n" print "\n" print "#endif /* KERNEL_TIMECONST_H */\n" diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index db087d7e106d..9652bc57fd09 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,7 +14,9 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/nmi.h> #include <linux/sched.h> +#include <linux/sched/loadavg.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> @@ -994,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, return 0; /* Interpolate shortest distance from beginning or end of history */ - interp_forward = partial_history_cycles > total_history_cycles/2 ? - true : false; + interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? total_history_cycles - partial_history_cycles : partial_history_cycles; @@ -1275,27 +1276,8 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); - -/** - * timekeeping_get_tai_offset - Returns current TAI offset from UTC - * - */ -s32 timekeeping_get_tai_offset(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - s32 ret; - - do { - seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tai_offset; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ret; -} - /** - * __timekeeping_set_tai_offset - Lock free worker function + * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic * */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) @@ -1305,24 +1287,6 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) } /** - * timekeeping_set_tai_offset - Sets the current TAI offset from UTC - * - */ -void timekeeping_set_tai_offset(s32 tai_offset) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - __timekeeping_set_tai_offset(tk, tai_offset); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - clock_was_set(); -} - -/** * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 704f595ce83f..d0914676d4c5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -11,8 +11,6 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); -extern s32 timekeeping_get_tai_offset(void); -extern void timekeeping_set_tai_offset(s32 tai_offset); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ec33a6933eae..152a706ef8b8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -38,8 +38,10 @@ #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/sched/sysctl.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> @@ -239,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write, int ret; mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(false); mutex_unlock(&mutex); @@ -571,38 +573,6 @@ internal_add_timer(struct timer_base *base, struct timer_list *timer) trigger_dyntick_cpu(base, timer); } -#ifdef CONFIG_TIMER_STATS -void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} - -static void timer_stats_account_timer(struct timer_list *timer) -{ - void *site; - - /* - * start_site can be concurrently reset by - * timer_stats_timer_clear_start_info() - */ - site = READ_ONCE(timer->start_site); - if (likely(!site)) - return; - - timer_stats_update_stats(timer, timer->start_pid, site, - timer->function, timer->start_comm, - timer->flags); -} - -#else -static void timer_stats_account_timer(struct timer_list *timer) {} -#endif - #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static struct debug_obj_descr timer_debug_descr; @@ -789,11 +759,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, { timer->entry.pprev = NULL; timer->flags = flags | raw_smp_processor_id(); -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif lockdep_init_map(&timer->lockdep_map, name, key, 0); } @@ -1001,8 +966,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) base = lock_timer_base(timer, &flags); } - timer_stats_timer_set_start_info(timer); - ret = detach_if_pending(timer, base, false); if (!ret && pending_only) goto out_unlock; @@ -1130,7 +1093,6 @@ void add_timer_on(struct timer_list *timer, int cpu) struct timer_base *new_base, *base; unsigned long flags; - timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); new_base = get_timer_cpu_base(timer->flags, cpu); @@ -1158,7 +1120,7 @@ void add_timer_on(struct timer_list *timer, int cpu) EXPORT_SYMBOL_GPL(add_timer_on); /** - * del_timer - deactive a timer. + * del_timer - deactivate a timer. * @timer: the timer to be deactivated * * del_timer() deactivates a timer - this works on both active and inactive @@ -1176,7 +1138,6 @@ int del_timer(struct timer_list *timer) debug_assert_init(timer); - timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); @@ -1204,10 +1165,9 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) { - timer_stats_timer_clear_start_info(timer); + if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); - } + spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1331,7 +1291,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) unsigned long data; timer = hlist_entry(head->first, struct timer_list, entry); - timer_stats_account_timer(timer); base->running_timer = timer; detach_timer(timer, true); @@ -1868,7 +1827,6 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); - init_timer_stats(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index afe6cd1944fc..0e7f5428a148 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> +#include <linux/nmi.h> #include <linux/uaccess.h> @@ -62,21 +63,11 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { -#ifdef CONFIG_TIMER_STATS - char tmp[TASK_COMM_LEN + 1]; -#endif SEQ_printf(m, " #%d: ", idx); print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02x", timer->state); -#ifdef CONFIG_TIMER_STATS - SEQ_printf(m, ", "); - print_name_offset(m, timer->start_site); - memcpy(tmp, timer->start_comm, TASK_COMM_LEN); - tmp[TASK_COMM_LEN] = 0; - SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); -#endif SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), @@ -96,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, next_one: i = 0; + + touch_nmi_watchdog(); + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = timerqueue_getnext(&base->active); @@ -127,7 +121,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .base: %pK\n", base); SEQ_printf(m, " .index: %d\n", base->index); - SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution); + SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); @@ -207,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; + touch_nmi_watchdog(); + SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c deleted file mode 100644 index afddded947df..000000000000 --- a/kernel/time/timer_stats.c +++ /dev/null @@ -1,425 +0,0 @@ -/* - * kernel/time/timer_stats.c - * - * Collect timer usage statistics. - * - * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * timer_stats is based on timer_top, a similar functionality which was part of - * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the - * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based - * on dynamic allocation of the statistics entries and linear search based - * lookup combined with a global lock, rather than the static array, hash - * and per-CPU locking which is used by timer_stats. It was written for the - * pre hrtimer kernel code and therefore did not take hrtimers into account. - * Nevertheless it provided the base for the timer_stats implementation and - * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks - * for this effort. - * - * timer_top.c is - * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus - * Written by Daniel Petrini <d.pensator@gmail.com> - * timer_top.c was released under the GNU General Public License version 2 - * - * We export the addresses and counting of timer functions being called, - * the pid and cmdline from the owner process if applicable. - * - * Start/stop data collection: - * # echo [1|0] >/proc/timer_stats - * - * Display the information collected so far: - * # cat /proc/timer_stats - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/proc_fs.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/sched.h> -#include <linux/seq_file.h> -#include <linux/kallsyms.h> - -#include <linux/uaccess.h> - -/* - * This is our basic unit of interest: a timer expiry event identified - * by the timer, its start/expire functions and the PID of the task that - * started the timer. We count the number of times an event happens: - */ -struct entry { - /* - * Hash list: - */ - struct entry *next; - - /* - * Hash keys: - */ - void *timer; - void *start_func; - void *expire_func; - pid_t pid; - - /* - * Number of timeout events: - */ - unsigned long count; - u32 flags; - - /* - * We save the command-line string to preserve - * this information past task exit: - */ - char comm[TASK_COMM_LEN + 1]; - -} ____cacheline_aligned_in_smp; - -/* - * Spinlock protecting the tables - not taken during lookup: - */ -static DEFINE_RAW_SPINLOCK(table_lock); - -/* - * Per-CPU lookup locks for fast hash lookup: - */ -static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); - -/* - * Mutex to serialize state changes with show-stats activities: - */ -static DEFINE_MUTEX(show_mutex); - -/* - * Collection status, active/inactive: - */ -int __read_mostly timer_stats_active; - -/* - * Beginning/end timestamps of measurement: - */ -static ktime_t time_start, time_stop; - -/* - * tstat entry structs only get allocated while collection is - * active and never freed during that time - this simplifies - * things quite a bit. - * - * They get freed when a new collection period is started. - */ -#define MAX_ENTRIES_BITS 10 -#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) - -static unsigned long nr_entries; -static struct entry entries[MAX_ENTRIES]; - -static atomic_t overflow_count; - -/* - * The entries are in a hash-table, for fast lookup: - */ -#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) -#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) -#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) - -#define __tstat_hashfn(entry) \ - (((unsigned long)(entry)->timer ^ \ - (unsigned long)(entry)->start_func ^ \ - (unsigned long)(entry)->expire_func ^ \ - (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) - -#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) - -static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; - -static void reset_entries(void) -{ - nr_entries = 0; - memset(entries, 0, sizeof(entries)); - memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); - atomic_set(&overflow_count, 0); -} - -static struct entry *alloc_entry(void) -{ - if (nr_entries >= MAX_ENTRIES) - return NULL; - - return entries + nr_entries++; -} - -static int match_entries(struct entry *entry1, struct entry *entry2) -{ - return entry1->timer == entry2->timer && - entry1->start_func == entry2->start_func && - entry1->expire_func == entry2->expire_func && - entry1->pid == entry2->pid; -} - -/* - * Look up whether an entry matching this item is present - * in the hash already. Must be called with irqs off and the - * lookup lock held: - */ -static struct entry *tstat_lookup(struct entry *entry, char *comm) -{ - struct entry **head, *curr, *prev; - - head = tstat_hashentry(entry); - curr = *head; - - /* - * The fastpath is when the entry is already hashed, - * we do this with the lookup lock held, but with the - * table lock not held: - */ - while (curr) { - if (match_entries(curr, entry)) - return curr; - - curr = curr->next; - } - /* - * Slowpath: allocate, set up and link a new hash entry: - */ - prev = NULL; - curr = *head; - - raw_spin_lock(&table_lock); - /* - * Make sure we have not raced with another CPU: - */ - while (curr) { - if (match_entries(curr, entry)) - goto out_unlock; - - prev = curr; - curr = curr->next; - } - - curr = alloc_entry(); - if (curr) { - *curr = *entry; - curr->count = 0; - curr->next = NULL; - memcpy(curr->comm, comm, TASK_COMM_LEN); - - smp_mb(); /* Ensure that curr is initialized before insert */ - - if (prev) - prev->next = curr; - else - *head = curr; - } - out_unlock: - raw_spin_unlock(&table_lock); - - return curr; -} - -/** - * timer_stats_update_stats - Update the statistics for a timer. - * @timer: pointer to either a timer_list or a hrtimer - * @pid: the pid of the task which set up the timer - * @startf: pointer to the function which did the timer setup - * @timerf: pointer to the timer callback function of the timer - * @comm: name of the process which set up the timer - * @tflags: The flags field of the timer - * - * When the timer is already registered, then the event counter is - * incremented. Otherwise the timer is registered in a free slot. - */ -void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, u32 tflags) -{ - /* - * It doesn't matter which lock we take: - */ - raw_spinlock_t *lock; - struct entry *entry, input; - unsigned long flags; - - if (likely(!timer_stats_active)) - return; - - lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); - - input.timer = timer; - input.start_func = startf; - input.expire_func = timerf; - input.pid = pid; - input.flags = tflags; - - raw_spin_lock_irqsave(lock, flags); - if (!timer_stats_active) - goto out_unlock; - - entry = tstat_lookup(&input, comm); - if (likely(entry)) - entry->count++; - else - atomic_inc(&overflow_count); - - out_unlock: - raw_spin_unlock_irqrestore(lock, flags); -} - -static void print_name_offset(struct seq_file *m, unsigned long addr) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name(addr, symname) < 0) - seq_printf(m, "<%p>", (void *)addr); - else - seq_printf(m, "%s", symname); -} - -static int tstats_show(struct seq_file *m, void *v) -{ - struct timespec64 period; - struct entry *entry; - unsigned long ms; - long events = 0; - ktime_t time; - int i; - - mutex_lock(&show_mutex); - /* - * If still active then calculate up to now: - */ - if (timer_stats_active) - time_stop = ktime_get(); - - time = ktime_sub(time_stop, time_start); - - period = ktime_to_timespec64(time); - ms = period.tv_nsec / 1000000; - - seq_puts(m, "Timer Stats Version: v0.3\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); - if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); - seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); - - for (i = 0; i < nr_entries; i++) { - entry = entries + i; - if (entry->flags & TIMER_DEFERRABLE) { - seq_printf(m, "%4luD, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } else { - seq_printf(m, " %4lu, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } - - print_name_offset(m, (unsigned long)entry->start_func); - seq_puts(m, " ("); - print_name_offset(m, (unsigned long)entry->expire_func); - seq_puts(m, ")\n"); - - events += entry->count; - } - - ms += period.tv_sec * 1000; - if (!ms) - ms = 1; - - if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", - events, events * 1000 / ms, - (events * 1000000 / ms) % 1000); - else - seq_printf(m, "%ld total events\n", events); - - mutex_unlock(&show_mutex); - - return 0; -} - -/* - * After a state change, make sure all concurrent lookup/update - * activities have stopped: - */ -static void sync_access(void) -{ - unsigned long flags; - int cpu; - - for_each_online_cpu(cpu) { - raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); - - raw_spin_lock_irqsave(lock, flags); - /* nothing */ - raw_spin_unlock_irqrestore(lock, flags); - } -} - -static ssize_t tstats_write(struct file *file, const char __user *buf, - size_t count, loff_t *offs) -{ - char ctl[2]; - - if (count != 2 || *offs) - return -EINVAL; - - if (copy_from_user(ctl, buf, count)) - return -EFAULT; - - mutex_lock(&show_mutex); - switch (ctl[0]) { - case '0': - if (timer_stats_active) { - timer_stats_active = 0; - time_stop = ktime_get(); - sync_access(); - } - break; - case '1': - if (!timer_stats_active) { - reset_entries(); - time_start = ktime_get(); - smp_mb(); - timer_stats_active = 1; - } - break; - default: - count = -EINVAL; - } - mutex_unlock(&show_mutex); - - return count; -} - -static int tstats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, tstats_show, NULL); -} - -static const struct file_operations tstats_fops = { - .open = tstats_open, - .read = seq_read, - .write = tstats_write, - .llseek = seq_lseek, - .release = single_release, -}; - -void __init init_timer_stats(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); -} - -static int __init init_tstats_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("timer_stats", 0644, NULL, &tstats_fops); - if (!pe) - return -ENOMEM; - return 0; -} -__initcall(init_tstats_procfs); diff --git a/kernel/torture.c b/kernel/torture.c index 0d887eb62856..55de96529287 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -30,6 +30,7 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/completion.h> @@ -311,7 +312,7 @@ EXPORT_SYMBOL_GPL(torture_random); /* * Variables for shuffling. The idea is to ensure that each CPU stays * idle for an extended period to test interactions with dyntick idle, - * as well as interactions with any per-CPU varibles. + * as well as interactions with any per-CPU variables. */ struct shuffle_task { struct list_head st_l; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d5038005eb5d..7e06f04e98fe 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,7 +134,8 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB + select TASKS_RCU if PREEMPT help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation @@ -429,7 +430,7 @@ config BLK_DEV_IO_TRACE If unsure, say N. -config KPROBE_EVENT +config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" @@ -447,7 +448,7 @@ config KPROBE_EVENT This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. -config UPROBE_EVENT +config UPROBE_EVENTS bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES depends on MMU @@ -455,7 +456,7 @@ config UPROBE_EVENT select UPROBES select PROBE_EVENTS select TRACING - default n + default y help This allows the user to add tracing events on top of userspace dynamic events (similar to tracepoints) on the fly via the trace @@ -466,7 +467,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS + depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS bool default y help diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e57980845549..90f2701d92a7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -57,7 +57,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o -obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o @@ -66,7 +66,7 @@ ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o -obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 95cecbf67f5c..bd8ae8d5ae9c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -28,6 +28,8 @@ #include <linux/uaccess.h> #include <linux/list.h> +#include "../../block/blk.h" + #include <trace/events/block.h> #include "trace_output.h" @@ -292,9 +294,6 @@ record_it: local_irq_restore(flags); } -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); @@ -433,9 +432,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -468,22 +467,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - mutex_lock(&blk_tree_mutex); - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) { - mutex_unlock(&blk_tree_mutex); - goto err; - } - } - mutex_unlock(&blk_tree_mutex); - - dir = debugfs_create_dir(buts->name, blk_tree_root); + if (!blk_debugfs_root) + goto err; + dir = debugfs_lookup(buts->name, blk_debugfs_root); + if (!dir) + bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); if (!dir) goto err; - bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); @@ -525,9 +517,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); - return 0; + ret = 0; err: - blk_trace_free(bt); + if (dir && !bt->dir) + dput(dir); + if (ret) + blk_trace_free(bt); return ret; } @@ -695,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q) /** * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for * @rq: the source request + * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @@ -704,56 +699,46 @@ void blk_trace_shutdown(struct request_queue *q) * Records an action against a request. Will log the bio offset + size. * **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, +static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt = rq->q->blk_trace; if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); - } else { + else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); - } -} -static void blk_add_trace_rq_abort(void *ignore, - struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), + rq->cmd_flags, what, error, 0, NULL); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(void *ignore, - struct request_queue *q, - struct request *rq, - unsigned int nr_bytes) +static void blk_add_trace_rq_complete(void *ignore, struct request *rq, + int error, unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); } /** @@ -948,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r); } @@ -972,12 +957,8 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); - else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, + BLK_TA_DRV_DATA, 0, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -985,8 +966,6 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); - WARN_ON(ret); ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); @@ -1039,7 +1018,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } @@ -1752,31 +1730,6 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING -void blk_dump_cmd(char *buf, struct request *rq) -{ - int i, end; - int len = rq->cmd_len; - unsigned char *cmd = rq->cmd; - - if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { - buf[0] = '\0'; - return; - } - - for (end = len - 1; end >= 0; end--) - if (cmd[end]) - break; - end++; - - for (i = 0; i < len; i++) { - buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); - if (i == end && end != len - 1) { - sprintf(buf, " .."); - break; - } - } -} - void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fa77311dadb2..460a031c77e5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, }; @@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EPERM; if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) return -EPERM; @@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto *bpf_get_probe_write_proto(void) @@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_trace_printk_proto(void) @@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); @@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_unsafe() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_unsafe(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_str_proto = { + .func = bpf_probe_read_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_str: + return &bpf_probe_read_str_proto; default: return NULL; } @@ -459,19 +491,21 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return false; if (off % size != 0) return false; + /* + * Assertion for 32 bit to make sure last 8 byte access + * (BPF_DW) to the last 4 byte member is disallowed. + */ + if (off + size > sizeof(struct pt_regs)) + return false; + return true; } -static const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl = { - .ops = &kprobe_prog_ops, - .type = BPF_PROG_TYPE_KPROBE, -}; - BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -492,8 +526,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, @@ -540,19 +574,16 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (off % size != 0) return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } -static const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl = { - .ops = &tracepoint_prog_ops, - .type = BPF_PROG_TYPE_TRACEPOINT, -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { @@ -572,50 +603,37 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - data), dst_reg, src_reg, + data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); - *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, offsetof(struct perf_sample_data, period)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - regs), dst_reg, src_reg, + regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, + si->off); break; } return insn - insn_buf; } -static const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; - -static struct bpf_prog_type_list perf_event_tl = { - .ops = &perf_event_prog_ops, - .type = BPF_PROG_TYPE_PERF_EVENT, -}; - -static int __init register_kprobe_prog_ops(void) -{ - bpf_register_prog_type(&kprobe_tl); - bpf_register_prog_type(&tracepoint_tl); - bpf_register_prog_type(&perf_event_tl); - return 0; -} -late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb230f06ba41..39dca4e86a94 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -15,6 +15,7 @@ #include <linux/stop_machine.h> #include <linux/clocksource.h> +#include <linux/sched/task.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/suspend.h> @@ -35,6 +36,7 @@ #include <trace/events/sched.h> +#include <asm/sections.h> #include <asm/setup.h> #include "trace_output.h" @@ -1094,27 +1096,18 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct list_head free_list; -}; - struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; }; -struct ftrace_hash { - unsigned long size_bits; - struct hlist_head *buckets; - unsigned long count; - struct rcu_head rcu; +struct ftrace_func_probe { + struct ftrace_probe_ops *probe_ops; + struct ftrace_ops ops; + struct trace_array *tr; + struct list_head list; + void *data; + int ref; }; /* @@ -1192,26 +1185,24 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +static __always_inline unsigned long +ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) { - return !hash || !hash->count; + if (hash->size_bits > 0) + return hash_long(ip, hash->size_bits); + + return 0; } -static struct ftrace_func_entry * -ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +/* Only use this function if ftrace_hash_empty() has already been tested */ +static __always_inline struct ftrace_func_entry * +__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - if (ftrace_hash_empty(hash)) - return NULL; - - if (hash->size_bits > 0) - key = hash_long(ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, ip); hhd = &hash->buckets[key]; hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { @@ -1221,17 +1212,32 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return NULL; } +/** + * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash + * @hash: The hash to look at + * @ip: The instruction pointer to test + * + * Search a given @hash to see if a given instruction pointer (@ip) + * exists in it. + * + * Returns the entry that holds the @ip if found. NULL otherwise. + */ +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + if (ftrace_hash_empty(hash)) + return NULL; + + return __ftrace_lookup_ip(hash, ip); +} + static void __add_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { struct hlist_head *hhd; unsigned long key; - if (hash->size_bits) - key = hash_long(entry->ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, entry->ip); hhd = &hash->buckets[key]; hlist_add_head(&entry->hlist, hhd); hash->count++; @@ -1264,7 +1270,7 @@ static void remove_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { - hlist_del(&entry->hlist); + hlist_del_rcu(&entry->hlist); hash->count--; } @@ -1383,9 +1389,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static int -ftrace_hash_move(struct ftrace_ops *ops, int enable, - struct ftrace_hash **dst, struct ftrace_hash *src) +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tn; @@ -1393,21 +1398,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; - int ret; int i; - /* Reject setting notrace hash on IPMODIFY ftrace_ops */ - if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) - return -EINVAL; - /* - * If the new source is empty, just free dst and assign it - * the empty_hash. + * If the new source is empty, just return the empty_hash. */ - if (!src->count) { - new_hash = EMPTY_HASH; - goto update; - } + if (!src->count) + return EMPTY_HASH; /* * Make the hash size about 1/2 the # found @@ -1421,7 +1418,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + return NULL; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1432,7 +1429,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } -update: + return new_hash; +} + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_hash *new_hash; + int ret; + + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + + new_hash = __ftrace_hash_move(src); + if (!new_hash) + return -ENOMEM; + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ if (enable) { /* IPMODIFY should be updated only when filter_hash updating */ @@ -1466,9 +1480,9 @@ static bool hash_contains_ip(unsigned long ip, * notrace hash is considered not in the notrace hash. */ return (ftrace_hash_empty(hash->filter_hash) || - ftrace_lookup_ip(hash->filter_hash, ip)) && + __ftrace_lookup_ip(hash->filter_hash, ip)) && (ftrace_hash_empty(hash->notrace_hash) || - !ftrace_lookup_ip(hash->notrace_hash, ip)); + !__ftrace_lookup_ip(hash->notrace_hash, ip)); } /* @@ -2792,18 +2806,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * callers are done before leaving this function. * The same goes for freeing the per_cpu data of the per_cpu * ops. - * - * Again, normal synchronize_sched() is not good enough. - * We need to do a hard force of sched synchronization. - * This is because we use preempt_disable() to do RCU, but - * the function tracers can be called where RCU is not watching - * (like before user_exit()). We can not rely on the RCU - * infrastructure to do the synchronization, thus we must do it - * ourselves. */ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { + /* + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ schedule_on_each_cpu(ftrace_sync); + /* + * When the kernel is preeptive, tasks can be preempted + * while on a ftrace trampoline. Just scheduling a task on + * a CPU is not good enough to flush them. Calling + * synchornize_rcu_tasks() will wait for those tasks to + * execute and either schedule voluntarily or enter user space. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_tasks(); + arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) @@ -2880,7 +2904,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ @@ -3040,34 +3064,63 @@ struct ftrace_iterator { struct ftrace_page *pg; struct dyn_ftrace *func; struct ftrace_func_probe *probe; + struct ftrace_func_entry *probe_entry; struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; - int hidx; + int pidx; int idx; unsigned flags; }; static void * -t_hash_next(struct seq_file *m, loff_t *pos) +t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->ops->private; + struct list_head *func_probes; + struct ftrace_hash *hash; + struct list_head *next; struct hlist_node *hnd = NULL; struct hlist_head *hhd; + int size; (*pos)++; iter->pos = *pos; - if (iter->probe) - hnd = &iter->probe->node; - retry: - if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + if (!tr) + return NULL; + + func_probes = &tr->func_probes; + if (list_empty(func_probes)) return NULL; - hhd = &ftrace_func_hash[iter->hidx]; + if (!iter->probe) { + next = func_probes->next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + } + + if (iter->probe_entry) + hnd = &iter->probe_entry->hlist; + + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + + retry: + if (iter->pidx >= size) { + if (iter->probe->list.next == func_probes) + return NULL; + next = iter->probe->list.next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + iter->pidx = 0; + } + + hhd = &hash->buckets[iter->pidx]; if (hlist_empty(hhd)) { - iter->hidx++; + iter->pidx++; hnd = NULL; goto retry; } @@ -3077,7 +3130,7 @@ t_hash_next(struct seq_file *m, loff_t *pos) else { hnd = hnd->next; if (!hnd) { - iter->hidx++; + iter->pidx++; goto retry; } } @@ -3085,26 +3138,28 @@ t_hash_next(struct seq_file *m, loff_t *pos) if (WARN_ON_ONCE(!hnd)) return NULL; - iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); return iter; } -static void *t_hash_start(struct seq_file *m, loff_t *pos) +static void *t_probe_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_DO_HASH)) + if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; if (iter->func_pos > *pos) return NULL; - iter->hidx = 0; + iter->probe = NULL; + iter->probe_entry = NULL; + iter->pidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, &l); + p = t_probe_next(m, &l); if (!p) break; } @@ -3112,50 +3167,42 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) return NULL; /* Only set this if we have an item */ - iter->flags |= FTRACE_ITER_HASH; + iter->flags |= FTRACE_ITER_PROBE; return iter; } static int -t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) { - struct ftrace_func_probe *rec; + struct ftrace_func_entry *probe_entry; + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; - rec = iter->probe; - if (WARN_ON_ONCE(!rec)) + probe = iter->probe; + probe_entry = iter->probe_entry; + + if (WARN_ON_ONCE(!probe || !probe_entry)) return -EIO; - if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, rec->data); + probe_ops = probe->probe_ops; - seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); + if (probe_ops->print) + return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data); - if (rec->data) - seq_printf(m, ":%p", rec->data); - seq_putc(m, '\n'); + seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, + (void *)probe_ops->func); return 0; } static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_func_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; - if (unlikely(ftrace_disabled)) - return NULL; - - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, pos); - (*pos)++; - iter->pos = iter->func_pos = *pos; - - if (iter->flags & FTRACE_ITER_PRINTALL) - return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -3166,11 +3213,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || - - ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || + if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + !ftrace_lookup_ip(iter->hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -3181,24 +3225,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } if (!rec) - return t_hash_start(m, pos); + return NULL; + iter->pos = iter->func_pos = *pos; iter->func = rec; return iter; } +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + loff_t l = *pos; /* t_hash_start() must use original pos */ + void *ret; + + if (unlikely(ftrace_disabled)) + return NULL; + + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_next(m, pos); + + if (iter->flags & FTRACE_ITER_PRINTALL) { + /* next must increment pos, and t_probe_start does not */ + (*pos)++; + return t_probe_start(m, &l); + } + + ret = t_func_next(m, pos); + + if (!ret) + return t_probe_start(m, &l); + + return ret; +} + static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); } static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -3218,20 +3289,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->func_hash->filter_hash)) || - (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->func_hash->notrace_hash))) { + if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + ftrace_hash_empty(iter->hash)) { + iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_hash_start(m, pos); + return t_probe_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ - iter->flags &= ~FTRACE_ITER_HASH; + iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_start(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3241,13 +3311,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { - p = t_next(m, p, &l); + p = t_func_next(m, &l); if (!p) break; } if (!p) - return t_hash_start(m, pos); + return t_probe_start(m, pos); return iter; } @@ -3278,8 +3348,8 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, iter); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) @@ -3340,12 +3410,13 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENODEV; iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); - if (iter) { - iter->pg = ftrace_pages_start; - iter->ops = &global_ops; - } + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; - return iter ? 0 : -ENOMEM; + return 0; } static int @@ -3354,13 +3425,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); - if (iter) { - iter->pg = ftrace_pages_start; - iter->flags = FTRACE_ITER_ENABLED; - iter->ops = &global_ops; - } + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; - return iter ? 0 : -ENOMEM; + return 0; } /** @@ -3425,7 +3497,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, ret = -ENOMEM; goto out_unlock; } - } + } else + iter->hash = hash; if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3455,7 +3528,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_ops *ops = inode->i_private; return ftrace_regex_open(ops, - FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); } @@ -3558,22 +3631,20 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, /* blank module name to match all modules */ if (!mod_g->len) { /* blank module globbing: modname xor exclude_mod */ - if ((!exclude_mod) != (!modname)) + if (!exclude_mod != !modname) goto func_match; return 0; } - /* not matching the module */ - if (!modname || !mod_matches) { - if (exclude_mod) - goto func_match; - else - return 0; - } - - if (mod_matches && exclude_mod) + /* + * exclude_mod is set to trace everything but the given + * module. If it is set and the module matches, then + * return 0. If it is not set, and the module doesn't match + * also return 0. Otherwise, check the function to see if + * that matches. + */ + if (!mod_matches == !exclude_mod) return 0; - func_match: /* blank search means to match all funcs in the mod */ if (!func_g->len) @@ -3639,6 +3710,56 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) return match_records(hash, buff, len, NULL); } +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash) +{ + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); +} + +static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, + struct ftrace_hash **orig_hash, + struct ftrace_hash *hash, + int enable) +{ + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; + int ret; + + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + return ret; +} /* * We register the module command as a template to show others how @@ -3646,7 +3767,7 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) */ static int -ftrace_mod_callback(struct ftrace_hash *hash, +ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, char *func, char *cmd, char *module, int enable) { int ret; @@ -3680,16 +3801,11 @@ core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct ftrace_func_probe *entry; - struct hlist_head *hhd; - unsigned long key; - - key = hash_long(ip, FTRACE_HASH_BITS); - - hhd = &ftrace_func_hash[key]; + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; - if (hlist_empty(hhd)) - return; + probe = container_of(op, struct ftrace_func_probe, ops); + probe_ops = probe->probe_ops; /* * Disable preemption for these calls to prevent a RCU grace @@ -3697,206 +3813,336 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu_notrace(entry, hhd, node) { - if (entry->ip == ip) - entry->ops->func(ip, parent_ip, &entry->data); - } + probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data); preempt_enable_notrace(); } -static struct ftrace_ops trace_probe_ops __read_mostly = -{ - .func = function_trace_probe_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(trace_probe_ops) +struct ftrace_func_map { + struct ftrace_func_entry entry; + void *data; }; -static int ftrace_probe_registered; +struct ftrace_func_mapper { + struct ftrace_hash hash; +}; -static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) +/** + * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper + * + * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. + */ +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) { - int ret; - int i; + struct ftrace_hash *hash; - if (ftrace_probe_registered) { - /* still need to update the function call sites */ - if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, - old_hash); - return; - } + /* + * The mapper is simply a ftrace_hash, but since the entries + * in the hash are not ftrace_func_entry type, we define it + * as a separate structure. + */ + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + return (struct ftrace_func_mapper *)hash; +} - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - break; - } - /* Nothing registered? */ - if (i == FTRACE_FUNC_HASHSIZE) - return; +/** + * ftrace_func_mapper_find_ip - Find some data mapped to an ip + * @mapper: The mapper that has the ip maps + * @ip: the instruction pointer to find the data for + * + * Returns the data mapped to @ip if found otherwise NULL. The return + * is actually the address of the mapper data pointer. The address is + * returned for use cases where the data is no bigger than a long, and + * the user can use the data pointer as its data instead of having to + * allocate more memory for the reference. + */ +void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, + unsigned long ip) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; - ret = ftrace_startup(&trace_probe_ops, 0); + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (!entry) + return NULL; - ftrace_probe_registered = 1; + map = (struct ftrace_func_map *)entry; + return &map->data; } -static void __disable_ftrace_function_probe(void) +/** + * ftrace_func_mapper_add_ip - Map some data to an ip + * @mapper: The mapper that has the ip maps + * @ip: The instruction pointer address to map @data to + * @data: The data to map to @ip + * + * Returns 0 on succes otherwise an error. + */ +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data) { - int i; + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; - if (!ftrace_probe_registered) - return; + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (entry) + return -EBUSY; - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - return; - } + map = kmalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; - /* no more funcs left */ - ftrace_shutdown(&trace_probe_ops, 0); + map->entry.ip = ip; + map->data = data; - ftrace_probe_registered = 0; -} + __add_hash_entry(&mapper->hash, &map->entry); + return 0; +} -static void ftrace_free_entry(struct ftrace_func_probe *entry) +/** + * ftrace_func_mapper_remove_ip - Remove an ip from the mapping + * @mapper: The mapper that has the ip maps + * @ip: The instruction pointer address to remove the data from + * + * Returns the data if it is found, otherwise NULL. + * Note, if the data pointer is used as the data itself, (see + * ftrace_func_mapper_find_ip(), then the return value may be meaningless, + * if the data pointer was set to zero. + */ +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip) { - if (entry->ops->free) - entry->ops->free(entry->ops, entry->ip, &entry->data); + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + void *data; + + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (!entry) + return NULL; + + map = (struct ftrace_func_map *)entry; + data = map->data; + + remove_hash_entry(&mapper->hash, entry); kfree(entry); + + return data; +} + +/** + * free_ftrace_func_mapper - free a mapping of ips and data + * @mapper: The mapper that has the ip maps + * @free_func: A function to be called on each data item. + * + * This is used to free the function mapper. The @free_func is optional + * and can be used if the data needs to be freed as well. + */ +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + struct hlist_head *hhd; + int size = 1 << mapper->hash.size_bits; + int i; + + if (free_func && mapper->hash.count) { + for (i = 0; i < size; i++) { + hhd = &mapper->hash.buckets[i]; + hlist_for_each_entry(entry, hhd, hlist) { + map = (struct ftrace_func_map *)entry; + free_func(map); + } + } + } + free_ftrace_hash(&mapper->hash); +} + +static void release_probe(struct ftrace_func_probe *probe) +{ + struct ftrace_probe_ops *probe_ops; + + mutex_lock(&ftrace_lock); + + WARN_ON(probe->ref <= 0); + + /* Subtract the ref that was used to protect this instance */ + probe->ref--; + + if (!probe->ref) { + probe_ops = probe->probe_ops; + /* + * Sending zero as ip tells probe_ops to free + * the probe->data itself + */ + if (probe_ops->free) + probe_ops->free(probe_ops, probe->tr, 0, probe->data); + list_del(&probe->list); + kfree(probe); + } + mutex_unlock(&ftrace_lock); +} + +static void acquire_probe_locked(struct ftrace_func_probe *probe) +{ + /* + * Add one ref to keep it from being freed when releasing the + * ftrace_lock mutex. + */ + probe->ref++; } int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops, + void *data) { - struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_probe *entry; - struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct ftrace_hash *hash; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int not; - unsigned long key; int count = 0; + int size; int ret; + int i; - func_g.type = filter_parse_regex(glob, strlen(glob), - &func_g.search, ¬); - func_g.len = strlen(func_g.search); - - /* we do not support '!' for function probes */ - if (WARN_ON(not)) + if (WARN_ON(!tr)) return -EINVAL; - mutex_lock(&trace_probe_ops.func_hash->regex_lock); + /* We do not support '!' for function probes */ + if (WARN_ON(glob[0] == '!')) + return -EINVAL; - old_hash_ops.filter_hash = old_hash; - /* Probes only have filters */ - old_hash_ops.notrace_hash = NULL; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); - if (!hash) { - count = -ENOMEM; - goto out; + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; } - - if (unlikely(ftrace_disabled)) { - count = -ENODEV; - goto out; + if (&probe->list == &tr->func_probes) { + probe = kzalloc(sizeof(*probe), GFP_KERNEL); + if (!probe) { + mutex_unlock(&ftrace_lock); + return -ENOMEM; + } + probe->probe_ops = probe_ops; + probe->ops.func = function_trace_probe_call; + probe->tr = tr; + ftrace_ops_init(&probe->ops); + list_add(&probe->list, &tr->func_probes); } - mutex_lock(&ftrace_lock); + acquire_probe_locked(probe); - do_for_each_ftrace_rec(pg, rec) { + mutex_unlock(&ftrace_lock); - if (rec->flags & FTRACE_FL_DISABLED) - continue; + mutex_lock(&probe->ops.func_hash->regex_lock); - if (!ftrace_match_record(rec, &func_g, NULL, 0)) - continue; + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - /* If we did not process any, then return error */ - if (!count) - count = -ENOMEM; - goto out_unlock; - } + ret = ftrace_match_records(hash, glob, strlen(glob)); - count++; + /* Nothing found? */ + if (!ret) + ret = -EINVAL; - entry->data = data; + if (ret < 0) + goto out; - /* - * The caller might want to do something special - * for each function we find. We call the callback - * to give the caller an opportunity to do so. - */ - if (ops->init) { - if (ops->init(ops, rec->ip, &entry->data) < 0) { - /* caller does not like this func */ - kfree(entry); + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) continue; + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. + */ + if (probe_ops->init) { + ret = probe_ops->init(probe_ops, tr, + entry->ip, data, + &probe->data); + if (ret < 0) { + if (probe_ops->free && count) + probe_ops->free(probe_ops, tr, + 0, probe->data); + probe->data = NULL; + goto out; + } } + count++; } + } - ret = enter_record(hash, rec, 0); - if (ret < 0) { - kfree(entry); - count = ret; - goto out_unlock; - } - - entry->ops = ops; - entry->ip = rec->ip; - - key = hash_long(entry->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + mutex_lock(&ftrace_lock); - } while_for_each_ftrace_rec(); + if (!count) { + /* Nothing was added? */ + ret = -EINVAL; + goto out_unlock; + } - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); + if (ret < 0) + goto err_unlock; - __enable_ftrace_function_probe(&old_hash_ops); + /* One ref for each new function traced */ + probe->ref += count; - if (!ret) - free_ftrace_hash_rcu(old_hash); - else - count = ret; + if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED)) + ret = ftrace_startup(&probe->ops, 0); out_unlock: mutex_unlock(&ftrace_lock); + + if (!ret) + ret = count; out: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); - return count; -} + release_probe(probe); -enum { - PROBE_TEST_FUNC = 1, - PROBE_TEST_DATA = 2 -}; + return ret; -static void -__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data, int flags) + err_unlock: + if (!probe_ops->free || !count) + goto out_unlock; + + /* Failed to do the move, need to call the free functions */ + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) + continue; + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + } + } + goto out_unlock; +} + +int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops) { - struct ftrace_func_entry *rec_entry; - struct ftrace_func_probe *entry; - struct ftrace_func_probe *p; + struct ftrace_ops_hash old_hash_ops; + struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; - struct list_head free_list; - struct ftrace_hash *hash; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; + struct ftrace_hash *hash = NULL; struct hlist_node *tmp; + struct hlist_head hhd; char str[KSYM_SYMBOL_LEN]; - int i, ret; + int count = 0; + int i, ret = -ENODEV; + int size; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; @@ -3910,86 +4156,104 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, /* we do not support '!' for function probes */ if (WARN_ON(not)) - return; + return -EINVAL; + } + + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; } + if (&probe->list == &tr->func_probes) + goto err_unlock_ftrace; - mutex_lock(&trace_probe_ops.func_hash->regex_lock); + ret = -EINVAL; + if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED)) + goto err_unlock_ftrace; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); - if (!hash) - /* Hmm, should report this somehow */ - goto out_unlock; + acquire_probe_locked(probe); - INIT_LIST_HEAD(&free_list); + mutex_unlock(&ftrace_lock); - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; + mutex_lock(&probe->ops.func_hash->regex_lock); - hlist_for_each_entry_safe(entry, tmp, hhd, node) { + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; - /* break up if statements for readability */ - if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) - continue; + if (ftrace_hash_empty(old_hash)) + goto out_unlock; - if ((flags & PROBE_TEST_DATA) && entry->data != data) - continue; + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + + ret = -ENOMEM; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) + goto out_unlock; + + INIT_HLIST_HEAD(&hhd); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { - /* do this last, since it is the most expensive */ if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); if (!ftrace_match(str, &func_g)) continue; } - - rec_entry = ftrace_lookup_ip(hash, entry->ip); - /* It is possible more than one entry had this ip */ - if (rec_entry) - free_hash_entry(hash, rec_entry); - - hlist_del_rcu(&entry->node); - list_add(&entry->free_list, &free_list); + count++; + remove_hash_entry(hash, entry); + hlist_add_head(&entry->hlist, &hhd); } } + + /* Nothing found? */ + if (!count) { + ret = -EINVAL; + goto out_unlock; + } + mutex_lock(&ftrace_lock); - __disable_ftrace_function_probe(); - /* - * Remove after the disable is called. Otherwise, if the last - * probe is removed, a null hash means *all enabled*. - */ - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + WARN_ON(probe->ref < count); + + probe->ref -= count; + + if (ftrace_hash_empty(hash)) + ftrace_shutdown(&probe->ops, 0); + + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); + + /* still need to update the function call sites */ + if (ftrace_enabled && !ftrace_hash_empty(hash)) + ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, + &old_hash_ops); synchronize_sched(); - if (!ret) - free_ftrace_hash_rcu(old_hash); - list_for_each_entry_safe(entry, p, &free_list, free_list) { - list_del(&entry->free_list); - ftrace_free_entry(entry); + hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { + hlist_del(&entry->hlist); + if (probe_ops->free) + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + kfree(entry); } mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); -} -void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) -{ - __unregister_ftrace_function_probe(glob, ops, data, - PROBE_TEST_FUNC | PROBE_TEST_DATA); -} + release_probe(probe); -void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) -{ - __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); -} + return ret; -void unregister_ftrace_function_probe_all(char *glob) -{ - __unregister_ftrace_function_probe(glob, NULL, NULL, 0); + err_unlock_ftrace: + mutex_unlock(&ftrace_lock); + return ret; } static LIST_HEAD(ftrace_commands); @@ -4041,9 +4305,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd) return ret; } -static int ftrace_process_regex(struct ftrace_hash *hash, +static int ftrace_process_regex(struct ftrace_iterator *iter, char *buff, int len, int enable) { + struct ftrace_hash *hash = iter->hash; + struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; int ret = -EINVAL; @@ -4063,10 +4329,13 @@ static int ftrace_process_regex(struct ftrace_hash *hash, command = strsep(&next, ":"); + if (WARN_ON_ONCE(!tr)) + return -EINVAL; + mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(hash, func, command, next, enable); + ret = p->func(tr, hash, func, command, next, enable); goto out_unlock; } } @@ -4103,7 +4372,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { - ret = ftrace_process_regex(iter->hash, parser->buffer, + ret = ftrace_process_regex(iter, parser->buffer, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) @@ -4148,44 +4417,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops, - struct ftrace_ops_hash *old_hash) -{ - struct ftrace_ops *op; - - if (!ftrace_enabled) - return; - - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); - return; - } - - /* - * If this is the shared global_ops filter, then we need to - * check if there is another ops that shares it, is enabled. - * If so, we still need to run the modify code. - */ - if (ops->func_hash != &global_ops.local_hash) - return; - - do_for_each_ftrace_op(op, ftrace_ops_list) { - if (op->func_hash == &global_ops.local_hash && - op->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); - /* Only need to do this once */ - return; - } - } while_for_each_ftrace_op(op); -} - static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; - struct ftrace_ops_hash old_hash_ops; - struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -4220,14 +4456,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } mutex_lock(&ftrace_lock); - old_hash = *orig_hash; - old_hash_ops.filter_hash = ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; - ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) { - ftrace_ops_update_code(ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); mutex_unlock(&ftrace_lock); out_regex_unlock: @@ -4382,7 +4611,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static unsigned long save_global_trampoline; static unsigned long save_global_flags; @@ -4401,26 +4630,38 @@ static int __init set_graph_notrace_function(char *str) } __setup("ftrace_graph_notrace=", set_graph_notrace_function); +static int __init set_graph_max_depth_function(char *str) +{ + if (!str) + return 0; + fgraph_max_depth = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("ftrace_graph_max_depth=", set_graph_max_depth_function); + static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; - unsigned long *table = ftrace_graph_funcs; - int *count = &ftrace_graph_count; + struct ftrace_hash *hash; - if (!enable) { - table = ftrace_graph_notrace_funcs; - count = &ftrace_graph_notrace_count; - } + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + if (WARN_ON(!hash)) + return; while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_graph_set_hash(hash, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); } + + if (enable) + ftrace_graph_hash = hash; + else + ftrace_graph_notrace_hash = hash; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4454,10 +4695,8 @@ static void __init set_ftrace_early_filters(void) int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_ops_hash old_hash_ops; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; - struct ftrace_hash *old_hash; struct trace_parser *parser; int filter_hash; int ret; @@ -4487,16 +4726,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); - old_hash = *orig_hash; - old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; - ret = ftrace_hash_move(iter->ops, filter_hash, - orig_hash, iter->hash); - if (!ret) { - ftrace_ops_update_code(iter->ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + iter->hash, filter_hash); mutex_unlock(&ftrace_lock); + } else { + /* For read only, the hash is the ops hash */ + iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); @@ -4540,26 +4775,55 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); -int ftrace_graph_count; -int ftrace_graph_notrace_count; -unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; -unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH; +struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH; + +enum graph_filter_type { + GRAPH_FILTER_NOTRACE = 0, + GRAPH_FILTER_FUNCTION, +}; + +#define FTRACE_GRAPH_EMPTY ((void *)1) struct ftrace_graph_data { - unsigned long *table; - size_t size; - int *count; - const struct seq_operations *seq_ops; + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; + const struct seq_operations *seq_ops; + struct trace_parser parser; }; static void * __g_next(struct seq_file *m, loff_t *pos) { struct ftrace_graph_data *fgd = m->private; + struct ftrace_func_entry *entry = fgd->entry; + struct hlist_head *head; + int i, idx = fgd->idx; - if (*pos >= *fgd->count) + if (*pos >= fgd->hash->count) return NULL; - return &fgd->table[*pos]; + + if (entry) { + hlist_for_each_entry_continue(entry, hlist) { + fgd->entry = entry; + return entry; + } + + idx++; + } + + for (i = idx; i < 1 << fgd->hash->size_bits; i++) { + head = &fgd->hash->buckets[i]; + hlist_for_each_entry(entry, head, hlist) { + fgd->entry = entry; + fgd->idx = i; + return entry; + } + } + return NULL; } static void * @@ -4575,10 +4839,19 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); + if (fgd->type == GRAPH_FILTER_FUNCTION) + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + else + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + /* Nothing, tell g_show to print all functions are enabled */ - if (!*fgd->count && !*pos) - return (void *)1; + if (ftrace_hash_empty(fgd->hash) && !*pos) + return FTRACE_GRAPH_EMPTY; + fgd->idx = 0; + fgd->entry = NULL; return __g_next(m, pos); } @@ -4589,22 +4862,22 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { - unsigned long *ptr = v; + struct ftrace_func_entry *entry = v; - if (!ptr) + if (!entry) return 0; - if (ptr == (unsigned long *)1) { + if (entry == FTRACE_GRAPH_EMPTY) { struct ftrace_graph_data *fgd = m->private; - if (fgd->table == ftrace_graph_funcs) + if (fgd->type == GRAPH_FILTER_FUNCTION) seq_puts(m, "#### all functions enabled ####\n"); else seq_puts(m, "#### no functions disabled ####\n"); return 0; } - seq_printf(m, "%ps\n", (void *)*ptr); + seq_printf(m, "%ps\n", (void *)entry->ip); return 0; } @@ -4621,24 +4894,51 @@ __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { int ret = 0; + struct ftrace_hash *new_hash = NULL; - mutex_lock(&graph_lock); - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - *fgd->count = 0; - memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX)) + return -ENOMEM; + + if (file->f_flags & O_TRUNC) + new_hash = alloc_ftrace_hash(size_bits); + else + new_hash = alloc_and_copy_ftrace_hash(size_bits, + fgd->hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } } - mutex_unlock(&graph_lock); if (file->f_mode & FMODE_READ) { - ret = seq_open(file, fgd->seq_ops); + ret = seq_open(file, &ftrace_graph_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = fgd; + } else { + /* Failed */ + free_ftrace_hash(new_hash); + new_hash = NULL; } } else file->private_data = fgd; +out: + if (ret < 0 && file->f_mode & FMODE_WRITE) + trace_parser_put(&fgd->parser); + + fgd->new_hash = new_hash; + + /* + * All uses of fgd->hash must be taken with the graph_lock + * held. The graph_lock is going to be released, so force + * fgd->hash to be reinitialized when it is taken again. + */ + fgd->hash = NULL; + return ret; } @@ -4646,6 +4946,7 @@ static int ftrace_graph_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4654,18 +4955,26 @@ ftrace_graph_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_count; + mutex_lock(&graph_lock); + + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_FUNCTION; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_notrace_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4674,45 +4983,97 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_notrace_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_notrace_count; + mutex_lock(&graph_lock); + + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_NOTRACE; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_release(struct inode *inode, struct file *file) { + struct ftrace_graph_data *fgd; + struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; + int ret = 0; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; - kfree(m->private); + fgd = m->private; seq_release(inode, file); } else { - kfree(file->private_data); + fgd = file->private_data; } - return 0; + + if (file->f_mode & FMODE_WRITE) { + + parser = &fgd->parser; + + if (trace_parser_loaded((parser))) { + parser->buffer[parser->idx] = 0; + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + } + + trace_parser_put(parser); + + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&graph_lock); + + if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_hash, new_hash); + } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + } + + mutex_unlock(&graph_lock); + + /* Wait till all users are no longer using the old hash */ + synchronize_sched(); + + free_ftrace_hash(old_hash); + } + + out: + kfree(fgd->new_hash); + kfree(fgd); + + return ret; } static int -ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) +ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) { struct ftrace_glob func_g; struct dyn_ftrace *rec; struct ftrace_page *pg; + struct ftrace_func_entry *entry; int fail = 1; int not; - bool exists; - int i; /* decode regex */ func_g.type = filter_parse_regex(buffer, strlen(buffer), &func_g.search, ¬); - if (!not && *idx >= size) - return -EBUSY; func_g.len = strlen(func_g.search); @@ -4729,26 +5090,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) continue; if (ftrace_match_record(rec, &func_g, NULL, 0)) { - /* if it is in the array */ - exists = false; - for (i = 0; i < *idx; i++) { - if (array[i] == rec->ip) { - exists = true; - break; - } - } + entry = ftrace_lookup_ip(hash, rec->ip); if (!not) { fail = 0; - if (!exists) { - array[(*idx)++] = rec->ip; - if (*idx >= size) - goto out; - } + + if (entry) + continue; + if (add_hash_entry(hash, rec->ip) < 0) + goto out; } else { - if (exists) { - array[i] = array[--(*idx)]; - array[*idx] = 0; + if (entry) { + free_hash_entry(hash, entry); fail = 0; } } @@ -4767,35 +5120,34 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_parser parser; ssize_t read, ret = 0; struct ftrace_graph_data *fgd = file->private_data; + struct trace_parser *parser; if (!cnt) return 0; - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) - return -ENOMEM; - - read = trace_get_user(&parser, ubuf, cnt, ppos); + /* Read mode uses seq functions */ + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + fgd = m->private; + } - if (read >= 0 && trace_parser_loaded((&parser))) { - parser.buffer[parser.idx] = 0; + parser = &fgd->parser; - mutex_lock(&graph_lock); + read = trace_get_user(parser, ubuf, cnt, ppos); - /* we allow only one expression at a time */ - ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, - parser.buffer); + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { - mutex_unlock(&graph_lock); + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + trace_parser_clear(parser); } if (!ret) ret = read; - trace_parser_put(&parser); - return ret; } @@ -5118,6 +5470,50 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ +void __init ftrace_free_init_mem(void) +{ + unsigned long start = (unsigned long)(&__init_begin); + unsigned long end = (unsigned long)(&__init_end); + struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + int order; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + mutex_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + again: + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (!rec) + continue; + pg->index--; + if (!pg->index) { + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + pg = container_of(last_pg, struct ftrace_page, next); + if (!(*last_pg)) + ftrace_pages = pg; + continue; + } + memmove(rec, rec + 1, + (pg->index - (rec - pg->records)) * sizeof(*rec)); + /* More than one function may be in this block */ + goto again; + } + mutex_unlock(&ftrace_lock); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5160,25 +5556,13 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { - -/* - * Currently there's no safe way to free a trampoline when the kernel - * is configured with PREEMPT. That is because a task could be preempted - * when it jumped to the trampoline, it may be preempted for a long time - * depending on the system load, and currently there's no way to know - * when it will be off the trampoline. If the trampoline is freed - * too early, when the task runs again, it will be executing on freed - * memory and crash. - */ -#ifdef CONFIG_PREEMPT - /* Currently, only non dynamic ops can have a trampoline */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - return; -#endif - arch_ftrace_update_trampoline(ops); } +void ftrace_init_trace_array(struct trace_array *tr) +{ + INIT_LIST_HEAD(&tr->func_probes); +} #else static struct ftrace_ops global_ops = { @@ -5233,6 +5617,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; tr->ops->private = tr; + ftrace_init_trace_array(tr); } void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) @@ -5357,7 +5742,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, * Normally the mcount trampoline will call the ops->func, but there * are times that it should not. For example, if the ops does not * have its own recursion protection, then it should call the - * ftrace_ops_recurs_func() instead. + * ftrace_ops_assist_func() instead. * * Returns the function that the trampoline should call for @ops. */ @@ -5387,6 +5772,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, trace_ignore_this_task(pid_list, next)); } +static void +ftrace_pid_follow_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, self, task); +} + +static void +ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, NULL, task); +} + +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + tr); + } else { + unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + tr); + } +} + static void clear_ftrace_pids(struct trace_array *tr) { struct trace_pid_list *pid_list; @@ -5410,6 +5832,15 @@ static void clear_ftrace_pids(struct trace_array *tr) trace_free_pid_list(pid_list); } +void ftrace_clear_pids(struct trace_array *tr) +{ + mutex_lock(&ftrace_lock); + + clear_ftrace_pids(tr); + + mutex_unlock(&ftrace_lock); +} + static void ftrace_pid_reset(struct trace_array *tr) { mutex_lock(&ftrace_lock); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a85739efcc30..4ae268e687fe 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6,6 +6,7 @@ #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> +#include <linux/sched/clock.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> @@ -437,6 +438,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; @@ -3404,11 +3406,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + struct buffer_page *head_page; + struct buffer_page *commit_page; + unsigned commit; cpu_buffer = iter->cpu_buffer; - return iter->head_page == cpu_buffer->commit_page && - iter->head == rb_commit_index(cpu_buffer); + /* Remember, trace recording is off when iterator is in use */ + reader = cpu_buffer->reader_page; + head_page = cpu_buffer->head_page; + commit_page = cpu_buffer->commit_page; + commit = rb_page_commit(commit_page); + + return ((iter->head_page == commit_page && iter->head == commit) || + (iter->head_page == reader && commit_page == head_page && + head_page->read == commit && + iter->head == rb_page_commit(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -4376,9 +4390,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct buffer_data_page *bpage; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = NULL; + unsigned long flags; struct page *page; + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (cpu_buffer->free_page) { + bpage = cpu_buffer->free_page; + cpu_buffer->free_page = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + if (bpage) + goto out; + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) @@ -4386,6 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) bpage = page_address(page); + out: rb_init_page(bpage); return bpage; @@ -4395,13 +4426,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for + * @cpu: the cpu buffer the page came from * @data: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { - free_page((unsigned long)data); + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = data; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (!cpu_buffer->free_page) { + cpu_buffer->free_page = bpage; + bpage = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -4825,9 +4872,9 @@ static __init int test_ringbuffer(void) rb_data[cpu].cnt = cpu; rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], "rbtester/%d", cpu); - if (WARN_ON(!rb_threads[cpu])) { + if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); - ret = -1; + ret = PTR_ERR(rb_threads[cpu]); goto out_free; } @@ -4837,9 +4884,9 @@ static __init int test_ringbuffer(void) /* Now create the rb hammer! */ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); - if (WARN_ON(!rb_hammer)) { + if (WARN_ON(IS_ERR(rb_hammer))) { pr_cont("FAILED\n"); - ret = -1; + ret = PTR_ERR(rb_hammer); goto out_free; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 6df9a83e20d7..9fbcaf567886 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -6,6 +6,7 @@ #include <linux/ring_buffer.h> #include <linux/completion.h> #include <linux/kthread.h> +#include <uapi/linux/sched/types.h> #include <linux/module.h> #include <linux/ktime.h> #include <asm/local.h> @@ -170,7 +171,7 @@ static enum event_status read_page(int cpu) } } } - ring_buffer_free_read_page(buffer, bpage); + ring_buffer_free_read_page(buffer, cpu, bpage); if (ret < 0) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..c4536c449021 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -257,19 +257,11 @@ unsigned long long ns2usecs(u64 nsec) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - TRACE_ITER_EVENT_FORK + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) /* - * The global_trace is the descriptor that holds the tracing - * buffers for the live tracing. For each CPU, it contains - * a link list of pages that will store trace entries. The - * page descriptor of the pages in the memory is used to hold - * the link list by linking the lru item in the page descriptor - * to each of the pages in the buffer per CPU. - * - * For each active CPU there is a data field that holds the - * pages for the buffer for that CPU. Each CPU has the same number - * of pages allocated for its buffer. + * The global_trace is the descriptor that holds the top-level tracing + * buffers for the live tracing. */ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, @@ -765,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer, return event; } -static void tracer_tracing_on(struct trace_array *tr) +void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_on(tr->trace_buffer.buffer); @@ -902,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -/** - * trace_snapshot - take a snapshot of the current buffer. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. - * - * Note, make sure to allocate the snapshot with either - * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot - * - * If the snapshot buffer is not allocated, it will stop tracing. - * Basically making a permanent snapshot. - */ -void tracing_snapshot(void) +static void tracing_snapshot_instance(struct trace_array *tr) { - struct trace_array *tr = &global_trace; struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -946,6 +923,27 @@ void tracing_snapshot(void) update_max_tr(tr, current, smp_processor_id()); local_irq_restore(flags); } + +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + + tracing_snapshot_instance(tr); +} EXPORT_SYMBOL_GPL(tracing_snapshot); static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, @@ -1047,7 +1045,7 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ -static void tracer_tracing_off(struct trace_array *tr) +void tracer_tracing_off(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_off(tr->trace_buffer.buffer); @@ -1193,6 +1191,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size) void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); + parser->buffer = NULL; } /* @@ -1431,6 +1430,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full) } #ifdef CONFIG_FTRACE_STARTUP_TEST +static bool selftests_can_run; + +struct trace_selftests { + struct list_head list; + struct tracer *type; +}; + +static LIST_HEAD(postponed_selftests); + +static int save_selftest(struct tracer *type) +{ + struct trace_selftests *selftest; + + selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); + if (!selftest) + return -ENOMEM; + + selftest->type = type; + list_add(&selftest->list, &postponed_selftests); + return 0; +} + static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; @@ -1441,6 +1462,14 @@ static int run_tracer_selftest(struct tracer *type) return 0; /* + * If a tracer registers early in boot up (before scheduling is + * initialized and such), then do not run its selftests yet. + * Instead, run it a little later in the boot process. + */ + if (!selftests_can_run) + return save_selftest(type); + + /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current * tracer to be this tracer. The tracer can then run some @@ -1489,6 +1518,47 @@ static int run_tracer_selftest(struct tracer *type) printk(KERN_CONT "PASSED\n"); return 0; } + +static __init int init_trace_selftests(void) +{ + struct trace_selftests *p, *n; + struct tracer *t, **last; + int ret; + + selftests_can_run = true; + + mutex_lock(&trace_types_lock); + + if (list_empty(&postponed_selftests)) + goto out; + + pr_info("Running postponed tracer tests:\n"); + + list_for_each_entry_safe(p, n, &postponed_selftests, list) { + ret = run_tracer_selftest(p->type); + /* If the test fails, then warn and remove from available_tracers */ + if (ret < 0) { + WARN(1, "tracer: %s failed selftest, disabling\n", + p->type->name); + last = &trace_types; + for (t = trace_types; t; t = t->next) { + if (t == p->type) { + *last = t->next; + break; + } + last = &t->next; + } + } + list_del(&p->list); + kfree(p); + } + + out: + mutex_unlock(&trace_types_lock); + + return 0; +} +early_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { @@ -1906,7 +1976,7 @@ static void __trace_find_cmdline(int pid, char comm[]) map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, get_saved_cmdlines(map)); + strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); else strcpy(comm, "<...>"); } @@ -1934,6 +2004,18 @@ void tracing_record_cmdline(struct task_struct *tsk) __this_cpu_write(trace_cmdline_save, false); } +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? + TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} +EXPORT_SYMBOL_GPL(trace_handle_return); + void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc) @@ -3229,13 +3311,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) + if (cpumask_available(iter->started) && + cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; - if (iter->started) + if (cpumask_available(iter->started)) cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ @@ -4129,6 +4212,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); + if (mask == TRACE_ITER_FUNC_FORK) + ftrace_pid_follow_fork(tr, enabled); + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE @@ -4348,22 +4434,23 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" "\t -:[<group>/]<event>\n" -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" + "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS "\t place: <path>:<offset>\n" #endif "\t args: <name>=fetcharg[:type]\n" @@ -5536,7 +5623,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; @@ -5969,6 +6055,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct ftrace_buffer_info { struct trace_iterator iter; void *spare; + unsigned int spare_cpu; unsigned int read; }; @@ -6298,9 +6385,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif - if (!info->spare) + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); + info->spare_cpu = iter->cpu_file; + } if (!info->spare) return -ENOMEM; @@ -6360,7 +6449,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, + info->spare_cpu, info->spare); kfree(info); mutex_unlock(&trace_types_lock); @@ -6371,6 +6461,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct buffer_ref { struct ring_buffer *buffer; void *page; + int cpu; int ref; }; @@ -6382,7 +6473,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); buf->private = 0; } @@ -6416,7 +6507,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); spd->partial[i].private = 0; } @@ -6434,7 +6525,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .pages = pages_def, .partial = partial_def, .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; @@ -6481,11 +6571,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, kfree(ref); break; } + ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, &ref->page, len, iter->cpu_file, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); kfree(ref); break; } @@ -6656,43 +6748,89 @@ static const struct file_operations tracing_dyn_info_fops = { #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - tracing_snapshot(); + tracing_snapshot_instance(tr); } static void -ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; - if (!*count) - return; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; - if (*count != -1) (*count)--; + } - tracing_snapshot(); + tracing_snapshot_instance(tr); } static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; seq_printf(m, "%ps:", (void *)ip); seq_puts(m, "snapshot"); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, @@ -6701,10 +6839,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = { static struct ftrace_probe_ops snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, }; static int -ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -6718,10 +6858,8 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; @@ -6740,11 +6878,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = alloc_snapshot(tr); + if (ret < 0) + goto out; - if (ret >= 0) - alloc_snapshot(&global_trace); + ret = register_ftrace_function_probe(glob, tr, ops, count); + out: return ret < 0 ? ret : 0; } @@ -7353,6 +7493,8 @@ static int instance_mkdir(const char *name) goto out_free_tr; } + ftrace_init_trace_array(tr); + init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); __update_tracer_options(tr); @@ -7409,6 +7551,7 @@ static int instance_rmdir(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); + ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove_recursive(tr->dir); free_trace_buffers(tr); @@ -7503,7 +7646,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7516,7 +7659,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; @@ -7972,6 +8115,9 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -8006,7 +8152,7 @@ out: return ret; } -void __init trace_init(void) +void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = @@ -8017,6 +8163,10 @@ void __init trace_init(void) static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); +} + +void __init trace_init(void) +{ trace_event_init(); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1ea51ab53edf..291a1bca5748 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -262,6 +262,9 @@ struct trace_array { #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; +#ifdef CONFIG_DYNAMIC_FTRACE + struct list_head func_probes; +#endif /* function tracing enabled */ int function_enabled; #endif @@ -579,6 +582,8 @@ void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); int tracer_tracing_is_on(struct trace_array *tr); +void tracer_tracing_on(struct trace_array *tr); +void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, @@ -696,6 +701,9 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +void ftrace_init_trace_array(struct trace_array *tr); +#else +static inline void ftrace_init_trace_array(struct trace_array *tr) { } #endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); @@ -753,6 +761,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + struct rcu_head rcu; +}; + +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); + +static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -787,53 +810,50 @@ extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc); - #ifdef CONFIG_DYNAMIC_FTRACE -/* TODO: make this variable */ -#define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_count; -extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; -extern int ftrace_graph_notrace_count; -extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern struct ftrace_hash *ftrace_graph_hash; +extern struct ftrace_hash *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(unsigned long addr) { - int i; - - if (!ftrace_graph_count) - return 1; - - for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) { - /* - * If no irqs are to be traced, but a set_graph_function - * is set, and called by an interrupt handler, we still - * want to trace it. - */ - if (in_irq()) - trace_recursion_set(TRACE_IRQ_BIT); - else - trace_recursion_clear(TRACE_IRQ_BIT); - return 1; - } + int ret = 0; + + preempt_disable_notrace(); + + if (ftrace_hash_empty(ftrace_graph_hash)) { + ret = 1; + goto out; } - return 0; + if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. + */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); + ret = 1; + } + +out: + preempt_enable_notrace(); + return ret; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { - int i; + int ret = 0; - if (!ftrace_graph_notrace_count) - return 0; + preempt_disable_notrace(); - for (i = 0; i < ftrace_graph_notrace_count; i++) { - if (addr == ftrace_graph_notrace_funcs[i]) - return 1; - } + if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr)) + ret = 1; - return 0; + preempt_enable_notrace(); + return ret; } #else static inline int ftrace_graph_addr(unsigned long addr) @@ -868,6 +888,14 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(struct trace_array *tr, + struct ftrace_hash *hash, + char *func, char *cmd, + char *params, int enable); +}; extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { @@ -884,6 +912,9 @@ int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_clear_pids(struct trace_array *tr); +int init_function_trace(void); +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); #else static inline int ftrace_trace_task(struct trace_array *tr) { @@ -902,15 +933,71 @@ ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_clear_pids(struct trace_array *tr) { } +static inline int init_function_trace(void) { return 0; } +static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + +struct ftrace_probe_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + struct trace_array *tr, + struct ftrace_probe_ops *ops, + void *data); + int (*init)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *init_data, + void **data); + void (*free)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_probe_ops *ops, + void *data); +}; + +struct ftrace_func_mapper; +typedef int (*ftrace_mapper_func)(void *data); + +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); +void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data); +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func); + +extern int +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops, void *data); +extern int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops); + +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); #else +struct ftrace_func_command; + +static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + return -EINVAL; +} +static inline __init int unregister_ftrace_command(char *cmd_name) +{ + return -EINVAL; +} /* * The ops parameter passed in is usually undefined. * This must be a macro. @@ -975,11 +1062,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, #ifdef CONFIG_FUNCTION_TRACER # define FUNCTION_FLAGS \ - C(FUNCTION, "function-trace"), + C(FUNCTION, "function-trace"), \ + C(FUNC_FORK, "function-fork"), # define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL +# define TRACE_ITER_FUNC_FORK 0UL #endif #ifdef CONFIG_STACKTRACE @@ -1300,7 +1389,8 @@ static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; + field->filter_type == FILTER_PTR_STRING || + field->filter_type == FILTER_COMM; } static inline bool is_function_field(struct ftrace_event_field *field) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index e3b488825ae3..16a8cf02eee9 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg) trace_do_benchmark(); /* - * We don't go to sleep, but let others - * run as well. + * We don't go to sleep, but let others run as well. + * This is bascially a "yield()" to let any task that + * wants to run, schedule in, but if the CPU is idle, + * we'll keep burning cycles. + * + * Note the _rcu_qs() version of cond_resched() will + * notify synchronize_rcu_tasks() that this thread has + * passed a quiescent state for rcu_tasks. Otherwise + * this thread will never voluntarily schedule which would + * block synchronize_rcu_tasks() indefinitely. */ - cond_resched(); + cond_resched_rcu_qs(); } return 0; @@ -175,9 +183,9 @@ int trace_benchmark_reg(void) bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - if (!bm_event_thread) { + if (IS_ERR(bm_event_thread)) { pr_warning("trace benchmark failed to create kernel thread\n"); - return -ENOMEM; + return PTR_ERR(bm_event_thread); } return 0; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 75489de546b6..4d8fdf3184dc 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex); static struct trace_array *branch_tracer; static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; @@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry = ring_buffer_event_data(event); /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') + p = f->data.file + strlen(f->data.file); + while (p >= f->data.file && *p != '/') p--; p++; - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); strncpy(entry->file, p, TRACE_FILE_SIZE); entry->func[TRACE_FUNC_SIZE] = 0; entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; + entry->constant = f->constant; + entry->line = f->data.line; entry->correct = val == expect; if (!call_filter_check_discard(call, entry, buffer, event)) @@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) } static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { if (!branch_tracing_enabled) return; @@ -195,13 +196,19 @@ core_initcall(init_branch_tracer); #else static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +void ftrace_likely_update(struct ftrace_likely_data *f, int val, + int expect, int is_constant) { + /* A constant is always correct */ + if (is_constant) { + f->constant++; + val = expect; + } /* * I would love to have a trace point here instead, but the * trace point code is so inundated with unlikely and likely @@ -212,9 +219,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) /* FIXME: Make this atomic! */ if (val == expect) - f->correct++; + f->data.correct++; else - f->incorrect++; + f->data.incorrect++; } EXPORT_SYMBOL(ftrace_likely_update); @@ -245,29 +252,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p) return percent; } -static int branch_stat_show(struct seq_file *m, void *v) +static const char *branch_stat_process_file(struct ftrace_branch_data *p) { - struct ftrace_branch_data *p = v; const char *f; - long percent; /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') f--; - f++; + return ++f; +} + +static void branch_stat_show(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + long percent; /* * The miss is overlayed on correct, and hit on incorrect. */ percent = get_incorrect_percent(p); - seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); +} + +static int branch_stat_show_normal(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + branch_stat_show(m, p, f); + return 0; +} + +static int annotate_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + int l; + + f = branch_stat_process_file(&p->data); + + if (!p->constant) + return branch_stat_show_normal(m, &p->data, f); + + l = snprintf(NULL, 0, "/%lu", p->constant); + l = l > 8 ? 0 : 8 - l; + + seq_printf(m, "%8lu/%lu %*lu ", + p->data.correct, p->constant, l, p->data.incorrect); + branch_stat_show(m, &p->data, f); return 0; } @@ -279,7 +317,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace) static void * annotated_branch_stat_next(void *v, int idx) { - struct ftrace_branch_data *p = v; + struct ftrace_likely_data *p = v; ++p; @@ -328,7 +366,7 @@ static struct tracer_stat annotated_branch_stats = { .stat_next = annotated_branch_stat_next, .stat_cmp = annotated_branch_stat_cmp, .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = annotate_branch_stat_show }; __init static int init_annotated_branch_stats(void) @@ -379,12 +417,21 @@ all_branch_stat_next(void *v, int idx) return p; } +static int all_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + + f = branch_stat_process_file(p); + return branch_stat_show_normal(m, p, f); +} + static struct tracer_stat all_branch_stats = { .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = all_branch_stat_show }; __init static int all_annotated_branch_stats(void) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 0f06532a755b..5fdc779f411d 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/ktime.h> #include <linux/trace_clock.h> diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index eb7396b7e7c3..adcdbbeae010 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch, __array( char, func, TRACE_FUNC_SIZE+1 ) __array( char, file, TRACE_FILE_SIZE+1 ) __field( char, correct ) + __field( char, constant ) ), - F_printk("%u:%s:%s (%u)", + F_printk("%u:%s:%s (%u)%s", __entry->line, - __entry->func, __entry->file, __entry->correct), + __entry->func, __entry->file, __entry->correct, + __entry->constant ? " CONSTANT" : ""), FILTER_OTHER ); @@ -346,14 +348,14 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __field( u64, duration ) __field( u64, outer_duration ) __field( u64, nmi_total_ts ) - __field_struct( struct timespec, timestamp ) - __field_desc( long, timestamp, tv_sec ) + __field_struct( struct timespec64, timestamp ) + __field_desc( s64, timestamp, tv_sec ) __field_desc( long, timestamp, tv_nsec ) __field( unsigned int, nmi_count ) __field( unsigned int, seqnum ) ), - F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 93116549a284..e7973e10398c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2460,15 +2460,8 @@ struct event_probe_data { bool enable; }; -static void -event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +static void update_event_probe(struct event_probe_data *data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; - - if (!data) - return; - if (data->enable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else @@ -2476,77 +2469,141 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) } static void -event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +event_enable_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; - if (!data) + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) + return; + + edata = *pdata; + update_event_probe(edata); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) return; - if (!data->count) + edata = *pdata; + + if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ - if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) + if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; - if (data->count != -1) - (data->count)--; + if (edata->count != -1) + (edata->count)--; - event_enable_probe(ip, parent_ip, _data); + update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *_data) + struct ftrace_probe_ops *ops, void *data) { - struct event_probe_data *data = _data; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + + if (WARN_ON_ONCE(!pdata || !*pdata)) + return 0; + + edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", - data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, - data->file->event_call->class->system, - trace_event_name(data->file->event_call)); + edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + edata->file->event_call->class->system, + trace_event_name(edata->file->event_call)); - if (data->count == -1) + if (edata->count == -1) seq_puts(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld\n", data->count); + seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int -event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = *data; + struct event_probe_data *edata = init_data; + int ret; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENODEV; + *data = mapper; + } + + ret = ftrace_func_mapper_add_ip(mapper, ip, edata); + if (ret < 0) + return ret; + + edata->ref++; - data->ref++; + return 0; +} + +static int free_probe_data(void *data) +{ + struct event_probe_data *edata = data; + + edata->ref--; + if (!edata->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(edata->file, 0, 1); + module_put(edata->file->event_call->mod); + kfree(edata); + } return 0; } static void -event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; - if (WARN_ON_ONCE(data->ref <= 0)) + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, free_probe_data); return; - - data->ref--; - if (!data->ref) { - /* Remove the SOFT_MODE flag */ - __ftrace_event_enable_disable(data->file, 0, 1); - module_put(data->file->event_call->mod); - kfree(data); } - *pdata = NULL; + + edata = ftrace_func_mapper_remove_ip(mapper, ip); + + if (WARN_ON_ONCE(!edata)) + return; + + if (WARN_ON_ONCE(edata->ref <= 0)) + return; + + free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { @@ -2578,10 +2635,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = { }; static int -event_enable_func(struct ftrace_hash *hash, +event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { - struct trace_array *tr = top_trace_array(); struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; @@ -2619,12 +2675,12 @@ event_enable_func(struct ftrace_hash *hash, ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - ret = 0; + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); goto out; } ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; @@ -2661,7 +2717,8 @@ event_enable_func(struct ftrace_hash *hash, ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; - ret = register_ftrace_function_probe(glob, ops, data); + + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f3a960ed75a1..1c21d0e2a145 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/stacktrace.h> +#include <linux/rculist.h> #include "tracing_map.h" #include "trace.h" diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6721a1e89f39..f2ac9d44f6c4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/slab.h> +#include <linux/rculist.h> #include "trace.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0efa00d80623..a3bddbfd0874 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -267,10 +267,14 @@ static struct tracer function_trace __tracer_data = }; #ifdef CONFIG_DYNAMIC_FTRACE -static void update_traceon_count(void **data, bool on) +static void update_traceon_count(struct ftrace_probe_ops *ops, + unsigned long ip, + struct trace_array *tr, bool on, + void *data) { - long *count = (long *)data; - long old_count = *count; + struct ftrace_func_mapper *mapper = data; + long *count; + long old_count; /* * Tracing gets disabled (or enabled) once per count. @@ -301,23 +305,22 @@ static void update_traceon_count(void **data, bool on) * setting the tracing_on file. But we currently don't care * about that. */ - if (!old_count) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + old_count = *count; + + if (old_count <= 0) return; /* Make sure we see count before checking tracing state */ smp_rmb(); - if (on == !!tracing_is_on()) + if (on == !!tracer_tracing_is_on(tr)) return; if (on) - tracing_on(); + tracer_tracing_on(tr); else - tracing_off(); - - /* unlimited? */ - if (old_count == -1) - return; + tracer_tracing_off(tr); /* Make sure tracing state is visible before updating count */ smp_wmb(); @@ -326,33 +329,41 @@ static void update_traceon_count(void **data, bool on) } static void -ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - update_traceon_count(data, 1); + update_traceon_count(ops, ip, tr, 1, data); } static void -ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - update_traceon_count(data, 0); + update_traceon_count(ops, ip, tr, 0, data); } static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (tracing_is_on()) + if (tracer_tracing_is_on(tr)) return; - tracing_on(); + tracer_tracing_on(tr); } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (!tracing_is_on()) + if (!tracer_tracing_is_on(tr)) return; - tracing_off(); + tracer_tracing_off(tr); } /* @@ -364,144 +375,218 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) */ #define STACK_SKIP 4 +static __always_inline void trace_stack(struct trace_array *tr) +{ + unsigned long flags; + int pc; + + local_save_flags(flags); + pc = preempt_count(); + + __trace_stack(tr, flags, STACK_SKIP, pc); +} + static void -ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - trace_dump_stack(STACK_SKIP); + trace_stack(tr); } static void -ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count; long old_count; long new_count; + if (!tracing_is_on()) + return; + + /* unlimited? */ + if (!mapper) { + trace_stack(tr); + return; + } + + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + /* * Stack traces should only execute the number of times the * user specified in the counter. */ do { - - if (!tracing_is_on()) - return; - old_count = *count; if (!old_count) return; - /* unlimited? */ - if (old_count == -1) { - trace_dump_stack(STACK_SKIP); - return; - } - new_count = old_count - 1; new_count = cmpxchg(count, old_count, new_count); if (new_count == old_count) - trace_dump_stack(STACK_SKIP); + trace_stack(tr); + + if (!tracing_is_on()) + return; } while (new_count != old_count); } -static int update_count(void **data) +static int update_count(struct ftrace_probe_ops *ops, unsigned long ip, + void *data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; - if (!*count) - return 0; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - if (*count != -1) + if (count) { + if (*count <= 0) + return 0; (*count)--; + } return 1; } static void -ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (update_count(data)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ALL); } /* Only dump the current CPU buffer. */ static void -ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (update_count(data)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ORIG); } static int ftrace_probe_print(const char *name, struct seq_file *m, - unsigned long ip, void *data) + unsigned long ip, struct ftrace_probe_ops *ops, + void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; seq_printf(m, "%ps:%s", (void *)ip, name); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } static int ftrace_traceon_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) + struct ftrace_probe_ops *ops, + void *data) { - return ftrace_probe_print("traceon", m, ip, data); + return ftrace_probe_print("traceon", m, ip, ops, data); } static int ftrace_traceoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("traceoff", m, ip, data); + return ftrace_probe_print("traceoff", m, ip, ops, data); } static int ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("stacktrace", m, ip, data); + return ftrace_probe_print("stacktrace", m, ip, ops, data); } static int ftrace_dump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("dump", m, ip, data); + return ftrace_probe_print("dump", m, ip, ops, data); } static int ftrace_cpudump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("cpudump", m, ip, data); + return ftrace_probe_print("cpudump", m, ip, ops, data); +} + + +static int +ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); } static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops traceoff_count_probe_ops = { .func = ftrace_traceoff_count, .print = ftrace_traceoff_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops stacktrace_count_probe_ops = { .func = ftrace_stacktrace_count, .print = ftrace_stacktrace_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops dump_probe_ops = { .func = ftrace_dump_probe, .print = ftrace_dump_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops cpudump_probe_ops = { @@ -525,7 +610,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = { }; static int -ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +ftrace_trace_probe_callback(struct trace_array *tr, + struct ftrace_probe_ops *ops, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { @@ -537,10 +623,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, if (!enable) return -EINVAL; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; @@ -559,13 +643,13 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = register_ftrace_function_probe(glob, tr, ops, count); return ret < 0 ? ret : 0; } static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, +ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -576,24 +660,24 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, else ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_stacktrace_callback(struct ftrace_hash *hash, +ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_dump_callback(struct ftrace_hash *hash, +ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -601,12 +685,12 @@ ftrace_dump_callback(struct ftrace_hash *hash, ops = &dump_probe_ops; /* Only dump once. */ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } static int -ftrace_cpudump_callback(struct ftrace_hash *hash, +ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -614,7 +698,7 @@ ftrace_cpudump_callback(struct ftrace_hash *hash, ops = &cpudump_probe_ops; /* Only dump once. */ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } @@ -687,9 +771,8 @@ static inline int init_func_cmd_traceon(void) } #endif /* CONFIG_DYNAMIC_FTRACE */ -static __init int init_function_trace(void) +__init int init_function_trace(void) { init_func_cmd_traceon(); return register_tracer(&function_trace); } -core_initcall(init_function_trace); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index af344a1bf0d0..d7c8e4ec3d9d 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -44,6 +44,7 @@ #include <linux/uaccess.h> #include <linux/cpumask.h> #include <linux/delay.h> +#include <linux/sched/clock.h> #include "trace.h" static struct trace_array *hwlat_trace; @@ -78,12 +79,12 @@ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; /* Individual latency samples are stored here when detected. */ struct hwlat_sample { - u64 seqnum; /* unique sequence */ - u64 duration; /* delta */ - u64 outer_duration; /* delta (outer loop) */ - u64 nmi_total_ts; /* Total time spent in NMIs */ - struct timespec timestamp; /* wall time */ - int nmi_count; /* # NMIs during this sample */ + u64 seqnum; /* unique sequence */ + u64 duration; /* delta */ + u64 outer_duration; /* delta (outer loop) */ + u64 nmi_total_ts; /* Total time spent in NMIs */ + struct timespec64 timestamp; /* wall time */ + int nmi_count; /* # NMIs during this sample */ }; /* keep the global state somewhere. */ @@ -249,7 +250,7 @@ static int get_sample(void) s.seqnum = hwlat_data.count; s.duration = sample; s.outer_duration = outer_sample; - s.timestamp = CURRENT_TIME; + ktime_get_real_ts64(&s.timestamp); s.nmi_total_ts = nmi_total_ts; s.nmi_count = nmi_count; trace_hwlat_sample(&s); @@ -266,24 +267,13 @@ out: static struct cpumask save_cpumask; static bool disable_migrate; -static void move_to_next_cpu(bool initmask) +static void move_to_next_cpu(void) { - static struct cpumask *current_mask; + struct cpumask *current_mask = &save_cpumask; int next_cpu; if (disable_migrate) return; - - /* Just pick the first CPU on first iteration */ - if (initmask) { - current_mask = &save_cpumask; - get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); - put_online_cpus(); - next_cpu = cpumask_first(current_mask); - goto set_affinity; - } - /* * If for some reason the user modifies the CPU affinity * of this thread, than stop migrating for the duration @@ -300,7 +290,6 @@ static void move_to_next_cpu(bool initmask) if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(current_mask); - set_affinity: if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ goto disable; @@ -322,20 +311,15 @@ static void move_to_next_cpu(bool initmask) * need to ensure nothing else might be running (and thus preempting). * Obviously this should never be used in production environments. * - * Currently this runs on which ever CPU it was scheduled on, but most - * real-world hardware latency situations occur across several CPUs, - * but we might later generalize this if we find there are any actualy - * systems with alternate SMI delivery or other hardware latencies. + * Executes one loop interaction on each CPU in tracing_cpumask sysfs file. */ static int kthread_fn(void *data) { u64 interval; - bool initmask = true; while (!kthread_should_stop()) { - move_to_next_cpu(initmask); - initmask = false; + move_to_next_cpu(); local_irq_disable(); get_sample(); @@ -366,13 +350,27 @@ static int kthread_fn(void *data) */ static int start_kthread(struct trace_array *tr) { + struct cpumask *current_mask = &save_cpumask; struct task_struct *kthread; + int next_cpu; + + /* Just pick the first CPU on first iteration */ + current_mask = &save_cpumask; + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + put_online_cpus(); + next_cpu = cpumask_first(current_mask); kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + sched_setaffinity(kthread->pid, current_mask); + hwlat_kthread = kthread; wake_up_process(kthread); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ad9e53ad174..8485f6738a87 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -16,13 +16,16 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> +#include <linux/rculist.h> #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" +#define KRETPROBE_MAXACTIVE_MAX 4096 /** * Kprobe event core functions @@ -280,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, void *addr, const char *symbol, unsigned long offs, + int maxactive, int nargs, bool is_return) { struct trace_kprobe *tk; @@ -307,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, else tk->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.maxactive = maxactive; + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; @@ -596,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * - Add kprobe: + * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: + * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -617,6 +625,7 @@ static int create_trace_kprobe(int argc, char **argv) int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; + int maxactive = 0; char *arg; unsigned long offset = 0; void *addr = NULL; @@ -635,8 +644,28 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } - if (argv[0][1] == ':') { - event = &argv[0][2]; + event = strchr(&argv[0][1], ':'); + if (event) { + event[0] = '\0'; + event++; + } + if (is_return && isdigit(argv[0][1])) { + ret = kstrtouint(&argv[0][1], 0, &maxactive); + if (ret) { + pr_info("Failed to parse maxactive.\n"); + return ret; + } + /* kretprobes instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > KRETPROBE_MAXACTIVE_MAX) { + pr_info("Maxactive is too big (%d > %d).\n", + maxactive, KRETPROBE_MAXACTIVE_MAX); + return -E2BIG; + } + } + + if (event) { if (strchr(event, '/')) { group = event; event = strchr(group, '/') + 1; @@ -679,10 +708,6 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } if (isdigit(argv[1][0])) { - if (is_return) { - pr_info("Return probe point must be a symbol.\n"); - return -EINVAL; - } /* an address specified */ ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { @@ -698,8 +723,9 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse symbol.\n"); return ret; } - if (offset && is_return) { - pr_info("Return probe must be used without offset.\n"); + if (offset && is_return && + !function_offset_within_entry(NULL, symbol, offset)) { + pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } } @@ -716,8 +742,8 @@ static int create_trace_kprobe(int argc, char **argv) is_return ? 'r' : 'p', addr); event = buf; } - tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, - is_return); + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, + argc, is_return); if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", (int)PTR_ERR(tk)); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..08f9bab8089e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -4,10 +4,11 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * */ - #include <linux/module.h> #include <linux/mutex.h> #include <linux/ftrace.h> +#include <linux/sched/clock.h> +#include <linux/sched/mm.h> #include "trace_output.h" @@ -124,6 +125,44 @@ EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + +const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { @@ -162,15 +201,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, } EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); +/** + * trace_print_hex_seq - print buffer as hex sequence + * @p: trace seq struct to write to + * @buf: The buffer to print + * @buf_len: Length of @buf in bytes + * @concatenate: Print @buf as single hex string or with spacing + * + * Prints the passed buffer as a hex sequence either as a whole, + * single hex string if @concatenate is true or with spacing after + * each byte in case @concatenate is false. + */ const char * -trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, + bool concatenate) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); - + trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", + buf[i]); trace_seq_putc(p, 0); return ret; @@ -1109,11 +1160,11 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld", field->seqnum, field->duration, field->outer_duration, - field->timestamp.tv_sec, + (long long)field->timestamp.tv_sec, field->timestamp.tv_nsec); if (field->nmi_count) { @@ -1143,10 +1194,10 @@ trace_hwlat_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", + trace_seq_printf(s, "%llu %lld %lld %09ld %u\n", field->duration, field->outer_duration, - field->timestamp.tv_sec, + (long long)field->timestamp.tv_sec, field->timestamp.tv_nsec, field->seqnum); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8c0553d9afd3..52478f033f88 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -21,6 +21,7 @@ * Copyright (C) IBM Corporation, 2010-2011 * Author: Srikar Dronamraju */ +#define pr_fmt(fmt) "trace_probe: " fmt #include "trace_probe.h" @@ -647,7 +648,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char **)) { - char *kbuf, *tmp; + char *kbuf, *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -667,27 +668,38 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, goto out; } kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE - 2); + ret = -EINVAL; + goto out; + } + } + done += size; - if (tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); + /* Remove comments */ + tmp = strchr(buf, '#'); - if (tmp) - *tmp = '\0'; + if (tmp) + *tmp = '\0'; - ret = traceprobe_command(kbuf, createfn); - if (ret) - goto out; + ret = traceprobe_command(buf, createfn); + if (ret) + goto out; + buf += size; + + } while (done < count); } ret = done; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 0c0ae54d44c6..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -248,7 +248,7 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define FETCH_TYPE_STRING 0 #define FETCH_TYPE_STRSIZE 1 -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); @@ -278,7 +278,7 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } -#endif /* CONFIG_KPROBE_EVENT */ +#endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { struct fetch_param fetch; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea77881..cb917cebae29 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,5 +1,6 @@ /* Include in trace.c */ +#include <uapi/linux/sched/types.h> #include <linux/stringify.h> #include <linux/kthread.h> #include <linux/delay.h> diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2a1abbaca10e..76aa04d4c925 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -2,6 +2,7 @@ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> * */ +#include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> @@ -34,7 +35,7 @@ unsigned long stack_trace_max_size; arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static DEFINE_PER_CPU(int, trace_active); +DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; @@ -64,7 +65,7 @@ void stack_trace_print(void) } /* - * When arch-specific code overides this function, the following + * When arch-specific code overrides this function, the following * data should be filled up, assuming stack_trace_max_lock is held to * prevent concurrent updates. * stack_trace_index[] @@ -95,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; + /* + * There's a slight chance that we are tracing inside the + * RCU infrastructure, and rcu_irq_enter() will not work + * as expected. + */ + if (unlikely(rcu_irq_enter_disabled())) + return; + local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); @@ -206,13 +215,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { unsigned long stack; - int cpu; preempt_disable_notrace(); - cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ - if (per_cpu(trace_active, cpu)++ != 0) + __this_cpu_inc(disable_stack_tracer); + if (__this_cpu_read(disable_stack_tracer) != 1) goto out; ip += MCOUNT_INSN_SIZE; @@ -220,7 +228,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, check_stack(ip, &stack); out: - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); /* prevent recursion in schedule */ preempt_enable_notrace(); } @@ -252,7 +260,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, long *ptr = filp->private_data; unsigned long val, flags; int ret; - int cpu; ret = kstrtoul_from_user(ubuf, count, 10, &val); if (ret) @@ -263,16 +270,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, /* * In case we trace inside arch_spin_lock() or after (NMI), * we will cause circular lock, so we also need to increase - * the percpu trace_active here. + * the percpu disable_stack_tracer here. */ - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); *ptr = val; arch_spin_unlock(&stack_trace_max_lock); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); local_irq_restore(flags); return count; @@ -306,12 +312,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - int cpu; - local_irq_disable(); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); @@ -323,12 +326,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { - int cpu; - arch_spin_unlock(&stack_trace_max_lock); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); local_irq_enable(); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0913693caf6e..a7581fec9681 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,12 +17,14 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> */ +#define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> #include <linux/uprobes.h> #include <linux/namei.h> #include <linux/string.h> +#include <linux/rculist.h> #include "trace_probe.h" @@ -431,7 +433,8 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - arg = strchr(argv[1], ':'); + /* Find the last occurrence, in case the path contains ':' too. */ + arg = strrchr(argv[1], ':'); if (!arg) { ret = -EINVAL; goto fail_address_parse; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1f9a31f934a4..685c50ae6300 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -24,7 +24,8 @@ #include <linux/tracepoint.h> #include <linux/err.h> #include <linux/slab.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> #include <linux/static_key.h> extern struct tracepoint * const __start___tracepoints_ptrs[]; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f8e26ab963ed..370724b45391 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -17,7 +17,9 @@ */ #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> +#include <linux/sched/cputime.h> #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/jiffies.h> @@ -31,7 +33,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; - cputime_t utime, stime, utimescaled, stimescaled; + u64 utime, stime, utimescaled, stimescaled; u64 delta; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); @@ -67,12 +69,12 @@ void bacct_add_tsk(struct user_namespace *user_ns, rcu_read_unlock(); task_cputime(tsk, &utime, &stime); - stats->ac_utime = cputime_to_usecs(utime); - stats->ac_stime = cputime_to_usecs(stime); + stats->ac_utime = div_u64(utime, NSEC_PER_USEC); + stats->ac_stime = div_u64(stime, NSEC_PER_USEC); task_cputime_scaled(tsk, &utimescaled, &stimescaled); - stats->ac_utimescaled = cputime_to_usecs(utimescaled); - stats->ac_stimescaled = cputime_to_usecs(stimescaled); + stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC); + stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; @@ -123,18 +125,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) #undef MB static void __acct_update_integrals(struct task_struct *tsk, - cputime_t utime, cputime_t stime) + u64 utime, u64 stime) { - cputime_t time, dtime; - u64 delta; + u64 time, delta; if (!likely(tsk->mm)) return; time = stime + utime; - dtime = time - tsk->acct_timexpd; - /* Avoid division: cputime_t is often in nanoseconds already. */ - delta = cputime_to_nsecs(dtime); + delta = time - tsk->acct_timexpd; if (delta < TICK_NSEC) return; @@ -155,7 +154,7 @@ static void __acct_update_integrals(struct task_struct *tsk, */ void acct_update_integrals(struct task_struct *tsk) { - cputime_t utime, stime; + u64 utime, stime; unsigned long flags; local_irq_save(flags); diff --git a/kernel/ucount.c b/kernel/ucount.c index 95c6336fc2b3..b4eeee03934f 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,6 +8,7 @@ #include <linux/stat.h> #include <linux/sysctl.h> #include <linux/slab.h> +#include <linux/cred.h> #include <linux/hash.h> #include <linux/user_namespace.h> @@ -57,7 +58,7 @@ static struct ctl_table_root set_root = { static int zero = 0; static int int_max = INT_MAX; -#define UCOUNT_ENTRY(name) \ +#define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ @@ -74,6 +75,10 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), +#ifdef CONFIG_INOTIFY_USER + UCOUNT_ENTRY("max_inotify_instances"), + UCOUNT_ENTRY("max_inotify_watches"), +#endif { } }; #endif /* CONFIG_SYSCTL */ @@ -139,7 +144,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - atomic_set(&new->count, 0); + new->count = 0; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -150,8 +155,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) ucounts = new; } } - if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) + if (ucounts->count == INT_MAX) ucounts = NULL; + else + ucounts->count += 1; spin_unlock_irq(&ucounts_lock); return ucounts; } @@ -160,13 +167,15 @@ static void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_test(&ucounts->count)) { - spin_lock_irqsave(&ucounts_lock, flags); + spin_lock_irqsave(&ucounts_lock, flags); + ucounts->count -= 1; + if (!ucounts->count) hlist_del_init(&ucounts->node); - spin_unlock_irqrestore(&ucounts_lock, flags); + else + ucounts = NULL; + spin_unlock_irqrestore(&ucounts_lock, flags); - kfree(ucounts); - } + kfree(ucounts); } static inline bool atomic_inc_below(atomic_t *v, int u) diff --git a/kernel/uid16.c b/kernel/uid16.c index 71645ae9303a..5c2dc5b2bf4f 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/highuid.h> #include <linux/security.h> +#include <linux/cred.h> #include <linux/syscalls.h> #include <linux/uaccess.h> diff --git a/kernel/user.c b/kernel/user.c index b069ccbfb0b0..00281add65b2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> +#include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86b7854fec8e..2f735cbe05e8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -8,6 +8,7 @@ #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> +#include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index 6976cd47dcf6..913fe4336d2b 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,8 +14,10 @@ #include <linux/utsname.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/cred.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> +#include <linux/sched/task.h> static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index c8eac43267e9..233cd8fc6910 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -14,6 +14,7 @@ #include <linux/utsname.h> #include <linux/sysctl.h> #include <linux/wait.h> +#include <linux/rwsem.h> #ifdef CONFIG_PROC_SYSCTL diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 63177be0159e..03e0b69bb5bf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -19,8 +19,11 @@ #include <linux/sysctl.h> #include <linux/smpboot.h> #include <linux/sched/rt.h> +#include <uapi/linux/sched/types.h> #include <linux/tick.h> #include <linux/workqueue.h> +#include <linux/sched/clock.h> +#include <linux/sched/debug.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 12b8dd640786..54a427d1f344 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -13,6 +13,8 @@ #include <linux/nmi.h> #include <linux/module.h> +#include <linux/sched/debug.h> + #include <asm/irq_regs.h> #include <linux/perf_event.h> @@ -137,12 +139,14 @@ static void watchdog_overflow_callback(struct perf_event *event, * Reduce the watchdog noise by only printing messages * that are different from what cpu0 displayed. */ -static unsigned long cpu0_err; +static unsigned long firstcpu_err; +static atomic_t watchdog_cpus; int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + int firstcpu = 0; /* nothing to do if the hard lockup detector is disabled */ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) @@ -156,19 +160,22 @@ int watchdog_nmi_enable(unsigned int cpu) if (event != NULL) goto out_enable; + if (atomic_inc_return(&watchdog_cpus) == 1) + firstcpu = 1; + wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); + /* save the first cpu's error for future comparision */ + if (firstcpu && IS_ERR(event)) + firstcpu_err = PTR_ERR(event); if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) + /* only print for the first cpu initialized */ + if (firstcpu || firstcpu_err) pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } @@ -186,7 +193,7 @@ int watchdog_nmi_enable(unsigned int cpu) smp_mb__after_atomic(); /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); /* vary the KERN level based on the returned errno */ @@ -222,9 +229,9 @@ void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); - } - if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; + if (atomic_dec_and_test(&watchdog_cpus)) + firstcpu_err = 0; } } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1d9fb6543a66..c74bf39ef764 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1507,6 +1507,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); WARN_ON_ONCE(timer_pending(timer)); @@ -1523,8 +1524,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, return; } - timer_stats_timer_set_start_info(&dwork->timer); - dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; @@ -3210,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool) INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; + setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, + (unsigned long)pool); setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); @@ -4736,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); + +/** + * work_on_cpu_safe - run a function in thread context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function argument + * + * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold + * any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. + */ +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + long ret = -ENODEV; + + get_online_cpus(); + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, fn, arg); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu_safe); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER |