diff options
Diffstat (limited to 'kernel')
109 files changed, 2248 insertions, 1663 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..12c679f769c6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o @@ -115,8 +116,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o $(obj)/configs.o: $(obj)/config_data.h -# config_data.h contains the same information as ikconfig.h but gzipped. -# Info from config_data can be extracted from /proc/config* targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) diff --git a/kernel/audit.c b/kernel/audit.c index 67b9fbd871be..6e399bb69d7c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -107,7 +107,6 @@ static u32 audit_rate_limit; * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) -static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ @@ -138,11 +137,18 @@ static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count; static LIST_HEAD(audit_freelist); -static struct sk_buff_head audit_skb_queue; -/* queue of skbs to send to auditd when/if it comes back */ -static struct sk_buff_head audit_skb_hold_queue; +/* queue msgs to send via kauditd_task */ +static struct sk_buff_head audit_queue; +/* queue msgs due to temporary unicast send problems */ +static struct sk_buff_head audit_retry_queue; +/* queue msgs waiting for new auditd connection */ +static struct sk_buff_head audit_hold_queue; + +/* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); + +/* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, @@ -338,7 +344,7 @@ static int audit_set_backlog_limit(u32 limit) static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", - &audit_backlog_wait_time_master, timeout); + &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) @@ -365,29 +371,10 @@ static int audit_set_failure(u32 state) } /* - * Queue skbs to be sent to auditd when/if it comes back. These skbs should - * already have been sent via prink/syslog and so if these messages are dropped - * it is not a huge concern since we already passed the audit_log_lost() - * notification and stuff. This is just nice to get audit messages during - * boot before auditd is running or messages generated while auditd is stopped. - * This only holds messages is audit_default is set, aka booting with audit=1 - * or building your kernel that way. - */ -static void audit_hold_skb(struct sk_buff *skb) -{ - if (audit_default && - (!audit_backlog_limit || - skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)) - skb_queue_tail(&audit_skb_hold_queue, skb); - else - kfree_skb(skb); -} - -/* * For one reason or another this nlh isn't getting delivered to the userspace * audit daemon, just send it to printk. */ -static void audit_printk_skb(struct sk_buff *skb) +static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); @@ -398,58 +385,123 @@ static void audit_printk_skb(struct sk_buff *skb) else audit_log_lost("printk limit exceeded"); } +} + +/** + * kauditd_hold_skb - Queue an audit record, waiting for auditd + * @skb: audit record + * + * Description: + * Queue the audit record, waiting for an instance of auditd. When this + * function is called we haven't given up yet on sending the record, but things + * are not looking good. The first thing we want to do is try to write the + * record via printk and then see if we want to try and hold on to the record + * and queue it, if we have room. If we want to hold on to the record, but we + * don't have room, record a record lost message. + */ +static void kauditd_hold_skb(struct sk_buff *skb) +{ + /* at this point it is uncertain if we will ever send this to auditd so + * try to send the message via printk before we go any further */ + kauditd_printk_skb(skb); + + /* can we just silently drop the message? */ + if (!audit_default) { + kfree_skb(skb); + return; + } + + /* if we have room, queue the message */ + if (!audit_backlog_limit || + skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_hold_queue, skb); + return; + } - audit_hold_skb(skb); + /* we have no other options - drop the message */ + audit_log_lost("kauditd hold queue overflow"); + kfree_skb(skb); } -static void kauditd_send_skb(struct sk_buff *skb) +/** + * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd + * @skb: audit record + * + * Description: + * Not as serious as kauditd_hold_skb() as we still have a connected auditd, + * but for some reason we are having problems sending it audit records so + * queue the given record and attempt to resend. + */ +static void kauditd_retry_skb(struct sk_buff *skb) { - int err; - int attempts = 0; -#define AUDITD_RETRIES 5 + /* NOTE: because records should only live in the retry queue for a + * short period of time, before either being sent or moved to the hold + * queue, we don't currently enforce a limit on this queue */ + skb_queue_tail(&audit_retry_queue, skb); +} + +/** + * auditd_reset - Disconnect the auditd connection + * + * Description: + * Break the auditd/kauditd connection and move all the records in the retry + * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex + * must be held when calling this function. + */ +static void auditd_reset(void) +{ + struct sk_buff *skb; + + /* break the connection */ + if (audit_sock) { + sock_put(audit_sock); + audit_sock = NULL; + } + audit_pid = 0; + audit_nlk_portid = 0; + + /* flush all of the retry queue to the hold queue */ + while ((skb = skb_dequeue(&audit_retry_queue))) + kauditd_hold_skb(skb); +} + +/** + * kauditd_send_unicast_skb - Send a record via unicast to auditd + * @skb: audit record + */ +static int kauditd_send_unicast_skb(struct sk_buff *skb) +{ + int rc; -restart: - /* take a reference in case we can't send it and we want to hold it */ + /* if we know nothing is connected, don't even try the netlink call */ + if (!audit_pid) + return -ECONNREFUSED; + + /* get an extra skb reference in case we fail to send */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); - if (err < 0) { - pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", - audit_pid, err); - if (audit_pid) { - if (err == -ECONNREFUSED || err == -EPERM - || ++attempts >= AUDITD_RETRIES) { - char s[32]; - - snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); - audit_log_lost(s); - audit_pid = 0; - audit_sock = NULL; - } else { - pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", - attempts, audit_pid); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - goto restart; - } - } - /* we might get lucky and get this in the next auditd */ - audit_hold_skb(skb); - } else - /* drop the extra reference if sent ok */ + rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); + if (rc >= 0) { consume_skb(skb); + rc = 0; + } + + return rc; } /* - * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * kauditd_send_multicast_skb - Send a record to any multicast listeners + * @skb: audit record * + * Description: * This function doesn't consume an skb as might be expected since it has to * copy it anyways. */ -static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) +static void kauditd_send_multicast_skb(struct sk_buff *skb) { - struct sk_buff *copy; - struct audit_net *aunet = net_generic(&init_net, audit_net_id); - struct sock *sock = aunet->nlsk; + struct sk_buff *copy; + struct audit_net *aunet = net_generic(&init_net, audit_net_id); + struct sock *sock = aunet->nlsk; + struct nlmsghdr *nlh; if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; @@ -464,74 +516,161 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) * no reason for new multicast clients to continue with this * non-compliance. */ - copy = skb_copy(skb, gfp_mask); + copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; + nlh = nlmsg_hdr(copy); + nlh->nlmsg_len = skb->len; - nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } -/* - * flush_hold_queue - empty the hold queue if auditd appears - * - * If auditd just started, drain the queue of messages already - * sent to syslog/printk. Remember loss here is ok. We already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. +/** + * kauditd_wake_condition - Return true when it is time to wake kauditd_thread * - * If you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. + * Description: + * This function is for use by the wait_event_freezable() call in + * kauditd_thread(). */ -static void flush_hold_queue(void) +static int kauditd_wake_condition(void) { - struct sk_buff *skb; - - if (!audit_default || !audit_pid) - return; - - skb = skb_dequeue(&audit_skb_hold_queue); - if (likely(!skb)) - return; + static int pid_last = 0; + int rc; + int pid = audit_pid; - while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); - } + /* wake on new messages or a change in the connected auditd */ + rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last); + if (rc) + pid_last = pid; - /* - * if auditd just disappeared but we - * dequeued an skb we need to drop ref - */ - consume_skb(skb); + return rc; } static int kauditd_thread(void *dummy) { + int rc; + int auditd = 0; + int reschedule = 0; + struct sk_buff *skb; + struct nlmsghdr *nlh; + +#define UNICAST_RETRIES 5 +#define AUDITD_BAD(x,y) \ + ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES) + + /* NOTE: we do invalidate the auditd connection flag on any sending + * errors, but we only "restore" the connection flag at specific places + * in the loop in order to help ensure proper ordering of audit + * records */ + set_freezable(); while (!kthread_should_stop()) { - struct sk_buff *skb; - - flush_hold_queue(); + /* NOTE: possible area for future improvement is to look at + * the hold and retry queues, since only this thread + * has access to these queues we might be able to do + * our own queuing and skip some/all of the locking */ + + /* NOTE: it might be a fun experiment to split the hold and + * retry queue handling to another thread, but the + * synchronization issues and other overhead might kill + * any performance gains */ + + /* attempt to flush the hold queue */ + while (auditd && (skb = skb_dequeue(&audit_hold_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + /* requeue to the same spot */ + skb_queue_head(&audit_hold_queue, skb); + + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); + reschedule = 0; + } + } else + /* we were able to send successfully */ + reschedule = 0; + } - skb = skb_dequeue(&audit_skb_queue); + /* attempt to flush the retry queue */ + while (auditd && (skb = skb_dequeue(&audit_retry_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + kauditd_hold_skb(skb); + mutex_lock(&audit_cmd_mutex); + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); + reschedule = 0; + } else + /* temporary problem (we hope), queue + * to the same spot and retry */ + skb_queue_head(&audit_retry_queue, skb); + } else + /* we were able to send successfully */ + reschedule = 0; + } + /* standard queue processing, try to be as quick as possible */ +quick_loop: + skb = skb_dequeue(&audit_queue); if (skb) { - if (!audit_backlog_limit || - (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) - wake_up(&audit_backlog_wait); - if (audit_pid) - kauditd_send_skb(skb); + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* attempt to send to any multicast listeners */ + kauditd_send_multicast_skb(skb); + + /* attempt to send to auditd, queue on failure */ + if (auditd) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); + reschedule = 0; + } + + /* move to the retry queue */ + kauditd_retry_skb(skb); + } else + /* everything is working so go fast! */ + goto quick_loop; + } else if (reschedule) + /* we are currently having problems, move to + * the retry queue */ + kauditd_retry_skb(skb); else - audit_printk_skb(skb); - continue; - } + /* dump the message via printk and hold it */ + kauditd_hold_skb(skb); + } else { + /* we have flushed the backlog so wake everyone */ + wake_up(&audit_backlog_wait); + + /* if everything is okay with auditd (if present), go + * to sleep until there is something new in the queue + * or we have a change in the connected auditd; + * otherwise simply reschedule to give things a chance + * to recover */ + if (reschedule) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else + wait_event_freezable(kauditd_wait, + kauditd_wake_condition()); - wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); + /* update the auditd connection status */ + auditd = (audit_pid ? 1 : 0); + } } + return 0; } @@ -596,6 +735,7 @@ static int audit_send_reply_thread(void *arg) kfree(reply); return 0; } + /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) @@ -832,16 +972,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - /* As soon as there's any sign of userspace auditd, - * start kauditd to talk to it */ - if (!kauditd_task) { - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; - } - } seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -855,9 +985,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_skb_queue); + s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time_master; + s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -897,9 +1027,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); - audit_pid = new_pid; - audit_nlk_portid = NETLINK_CB(skb).portid; - audit_sock = skb->sk; + if (new_pid) { + if (audit_sock) + sock_put(audit_sock); + audit_pid = new_pid; + audit_nlk_portid = NETLINK_CB(skb).portid; + sock_hold(skb->sk); + audit_sock = skb->sk; + } else { + auditd_reset(); + } + wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); @@ -1167,10 +1305,10 @@ static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); struct sock *sock = aunet->nlsk; - if (sock == audit_sock) { - audit_pid = 0; - audit_sock = NULL; - } + mutex_lock(&audit_cmd_mutex); + if (sock == audit_sock) + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); netlink_kernel_release(sock); aunet->nlsk = NULL; @@ -1195,17 +1333,24 @@ static int __init audit_init(void) audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); - skb_queue_head_init(&audit_skb_queue); - skb_queue_head_init(&audit_skb_hold_queue); + skb_queue_head_init(&audit_queue); + skb_queue_head_init(&audit_retry_queue); + skb_queue_head_init(&audit_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) { + int err = PTR_ERR(kauditd_task); + panic("audit: failed to start the kauditd thread (%d)\n", err); + } + + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + return 0; } __initcall(audit_init); @@ -1338,24 +1483,6 @@ static inline void audit_get_stamp(struct audit_context *ctx, } } -/* - * Wait for auditd to drain the queue a little - */ -static long wait_for_auditd(long sleep_time) -{ - DECLARE_WAITQUEUE(wait, current); - - if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { - add_wait_queue_exclusive(&audit_backlog_wait, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - sleep_time = schedule_timeout(sleep_time); - remove_wait_queue(&audit_backlog_wait, &wait); - } - - return sleep_time; -} - /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) @@ -1374,12 +1501,9 @@ static long wait_for_auditd(long sleep_time) struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { - struct audit_buffer *ab = NULL; - struct timespec t; - unsigned int uninitialized_var(serial); - int reserve = 5; /* Allow atomic callers to go up to five - entries over the normal backlog limit */ - unsigned long timeout_start = jiffies; + struct audit_buffer *ab; + struct timespec t; + unsigned int uninitialized_var(serial); if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1387,38 +1511,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->tgid) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - else - reserve = 0; - } - - while (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { - long sleep_time; + /* don't ever fail/sleep on these two conditions: + * 1. auditd generated record - since we need auditd to drain the + * queue; also, when we are checking for auditd, compare PIDs using + * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() + * using a PID anchored in the caller's namespace + * 2. audit command message - record types 1000 through 1099 inclusive + * are command messages/records used to manage the kernel subsystem + * and the audit userspace, blocking on these messages could cause + * problems under load so don't do it (note: not all of these + * command types are valid as record types, but it is quicker to + * just check two ints than a series of ints in a if/switch stmt) */ + if (!((audit_pid && audit_pid == task_tgid_vnr(current)) || + (type >= 1000 && type <= 1099))) { + long sleep_time = audit_backlog_wait_time; + + while (audit_backlog_limit && + (skb_queue_len(&audit_queue) > audit_backlog_limit)) { + /* wake kauditd to try and flush the queue */ + wake_up_interruptible(&kauditd_wait); - sleep_time = timeout_start + audit_backlog_wait_time - jiffies; - if (sleep_time > 0) { - sleep_time = wait_for_auditd(sleep_time); - if (sleep_time > 0) - continue; + /* sleep if we are allowed and we haven't exhausted our + * backlog wait limit */ + if ((gfp_mask & __GFP_DIRECT_RECLAIM) && + (sleep_time > 0)) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&audit_backlog_wait, + &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + sleep_time = schedule_timeout(sleep_time); + remove_wait_queue(&audit_backlog_wait, &wait); + } else { + if (audit_rate_check() && printk_ratelimit()) + pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", + skb_queue_len(&audit_queue), + audit_backlog_limit); + audit_log_lost("backlog limit exceeded"); + return NULL; } } - if (audit_rate_check() && printk_ratelimit()) - pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", - skb_queue_len(&audit_skb_queue), - audit_backlog_limit); - audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = 0; - wake_up(&audit_backlog_wait); - return NULL; } - if (!reserve && !audit_backlog_wait_time) - audit_backlog_wait_time = audit_backlog_wait_time_master; - ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); @@ -1426,9 +1560,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); + return ab; } @@ -1759,7 +1893,7 @@ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, * @call_panic: optional pointer to int that will be updated if secid fails */ void audit_log_name(struct audit_context *context, struct audit_names *n, - struct path *path, int record_num, int *call_panic) + const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); @@ -1947,7 +2081,7 @@ EXPORT_SYMBOL(audit_log_task_info); * @operation: specific link operation * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, struct path *link) +void audit_log_link_denied(const char *operation, const struct path *link) { struct audit_buffer *ab; struct audit_names *name; @@ -1978,10 +2112,10 @@ out: * audit_log_end - end one audit record * @ab: the audit_buffer * - * netlink_unicast() cannot be called inside an irq context because it blocks - * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed - * on a queue and a tasklet is scheduled to remove them from the queue outside - * the irq context. May be called in any context. + * We can not do a netlink send inside an irq context because it blocks (last + * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a + * queue and a tasklet is scheduled to remove them from the queue outside the + * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { @@ -1990,28 +2124,8 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - - nlh->nlmsg_len = ab->skb->len; - kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); - - /* - * The original kaudit unicast socket sends up messages with - * nlmsg_len set to the payload length rather than the entire - * message length. This breaks the standard set by netlink. - * The existing auditd daemon assumes this breakage. Fixing - * this would require co-ordinating a change in the established - * protocol between the kaudit kernel subsystem and the auditd - * userspace code. - */ - nlh->nlmsg_len -= NLMSG_HDRLEN; - - if (audit_pid) { - skb_queue_tail(&audit_skb_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); - } else { - audit_printk_skb(ab->skb); - } + skb_queue_tail(&audit_queue, ab->skb); + wake_up_interruptible(&kauditd_wait); ab->skb = NULL; } audit_buffer_free(ab); diff --git a/kernel/audit.h b/kernel/audit.h index 431444c3708b..960d49c9db5e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -212,7 +212,7 @@ extern void audit_copy_inode(struct audit_names *name, extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, - struct audit_names *n, struct path *path, + struct audit_names *n, const struct path *path, int record_num, int *call_panic); extern int audit_pid; diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f84f8d06e1f6..7ea57e516029 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -74,7 +74,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_ } static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, - struct inode *inode) + const struct inode *inode) { audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; @@ -130,10 +130,9 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, audit_mark->path); audit_log_key(ab, rule->filterkey); @@ -168,11 +167,11 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { struct audit_fsnotify_mark *audit_mark; - struct inode *inode = NULL; + const struct inode *inode = NULL; audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); @@ -180,10 +179,10 @@ static int audit_mark_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = ((struct path *)data)->dentry->d_inode; + inode = ((const struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 25772476fa4a..7b44195da81b 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -231,9 +231,11 @@ static void untag_chunk(struct node *p) if (size) new = alloc_chunk(size); + mutex_lock(&entry->group->mark_mutex); spin_lock(&entry->lock); if (chunk->dead || !entry->inode) { spin_unlock(&entry->lock); + mutex_unlock(&entry->group->mark_mutex); if (new) free_chunk(new); goto out; @@ -251,6 +253,7 @@ static void untag_chunk(struct node *p) list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); spin_unlock(&entry->lock); + mutex_unlock(&entry->group->mark_mutex); fsnotify_destroy_mark(entry, audit_tree_group); goto out; } @@ -258,8 +261,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { + if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode, + NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -293,6 +296,7 @@ static void untag_chunk(struct node *p) owner->root = new; spin_unlock(&hash_lock); spin_unlock(&entry->lock); + mutex_unlock(&entry->group->mark_mutex); fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; @@ -309,6 +313,7 @@ Fallback: put_tree(owner); spin_unlock(&hash_lock); spin_unlock(&entry->lock); + mutex_unlock(&entry->group->mark_mutex); out: fsnotify_put_mark(entry); spin_lock(&hash_lock); @@ -386,18 +391,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk_entry = &chunk->mark; + mutex_lock(&old_entry->group->mark_mutex); spin_lock(&old_entry->lock); if (!old_entry->inode) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); + mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(old_entry); free_chunk(chunk); return -ENOENT; } - fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { + if (fsnotify_add_mark_locked(chunk_entry, old_entry->group, + old_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); + mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; @@ -413,6 +421,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk->dead = 1; spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); + mutex_unlock(&old_entry->group->mark_mutex); fsnotify_destroy_mark(chunk_entry, audit_tree_group); @@ -445,6 +454,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); + mutex_unlock(&old_entry->group->mark_mutex); fsnotify_destroy_mark(old_entry, audit_tree_group); fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ @@ -458,8 +468,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "op="); - audit_log_string(ab, "remove_rule"); + audit_log_format(ab, "op=remove_rule"); audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); @@ -948,7 +957,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..f79e4658433d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -242,10 +242,9 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); audit_log_key(ab, r->filterkey); @@ -472,10 +471,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { - struct inode *inode; + const struct inode *inode; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); @@ -484,10 +483,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = d_backing_inode(((struct path *)data)->dentry); + inode = d_backing_inode(((const struct path *)data)->dentry); break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); @@ -548,8 +547,8 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) exe_file = get_task_exe_file(tsk); if (!exe_file) return 0; - ino = exe_file->f_inode->i_ino; - dev = exe_file->f_inode->i_sb->s_dev; + ino = file_inode(exe_file)->i_ino; + dev = file_inode(exe_file)->i_sb->s_dev; fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 85d9cac497e4..880519d6cf2a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -363,6 +363,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: + case AUDIT_SESSIONID: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -476,6 +477,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (!gid_valid(f->gid)) goto exit_free; break; + case AUDIT_SESSIONID: case AUDIT_ARCH: entry->rule.arch_f = f; break; @@ -1074,8 +1076,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re return; audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); audit_log_task_context(ab); - audit_log_format(ab, " op="); - audit_log_string(ab, action); + audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2cd5256dbff7..cf1fa43512c1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -446,6 +446,7 @@ static int audit_filter_rules(struct task_struct *tsk, const struct cred *cred; int i, need_sid = 1; u32 sid; + unsigned int sessionid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); @@ -508,6 +509,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FSGID: result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; + case AUDIT_SESSIONID: + sessionid = audit_get_sessionid(current); + result = audit_comparator(sessionid, f->op, f->val); + break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); break; @@ -1000,7 +1005,7 @@ static void audit_log_execve_info(struct audit_context *context, long len_rem; long len_full; long len_buf; - long len_abuf; + long len_abuf = 0; long len_tmp; bool require_data; bool encode; @@ -2025,8 +2030,11 @@ int audit_set_loginuid(kuid_t loginuid) goto out; /* are we setting or clearing? */ - if (uid_valid(loginuid)) + if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == (unsigned int)-1)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } task->sessionid = sessionid; task->loginuid = loginuid; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index a2ac051c342f..229a5d5df977 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); - if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 83e0d153b0b4..503d4211988a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -105,19 +105,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; + u32 pages, delta; + int ret; BUG_ON(fp_old == NULL); size = round_up(size, PAGE_SIZE); - if (size <= fp_old->pages * PAGE_SIZE) + pages = size / PAGE_SIZE; + if (pages <= fp_old->pages) return fp_old; + delta = pages - fp_old->pages; + ret = __bpf_prog_charge(fp_old->aux->user, delta); + if (ret) + return NULL; + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); - if (fp != NULL) { + if (fp == NULL) { + __bpf_prog_uncharge(fp_old->aux->user, delta); + } else { kmemcheck_annotate_bitfield(fp, meta); memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); - fp->pages = size / PAGE_SIZE; + fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new @@ -136,28 +146,30 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } -#define SHA_BPF_RAW_SIZE \ - round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) - -/* Called under verifier mutex. */ -void bpf_prog_calc_digest(struct bpf_prog *fp) +int bpf_prog_calc_tag(struct bpf_prog *fp) { const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); - static u32 ws[SHA_WORKSPACE_WORDS]; - static u8 raw[SHA_BPF_RAW_SIZE]; - struct bpf_insn *dst = (void *)raw; + u32 raw_size = bpf_prog_tag_scratch_size(fp); + u32 digest[SHA_DIGEST_WORDS]; + u32 ws[SHA_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; + struct bpf_insn *dst; bool was_ld_map; - u8 *todo = raw; + u8 *raw, *todo; __be32 *result; __be64 *bits; - sha_init(fp->digest); + raw = vmalloc(raw_size); + if (!raw) + return -ENOMEM; + + sha_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ + dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -177,12 +189,13 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) } } - psize = fp->len * sizeof(struct bpf_insn); - memset(&raw[psize], 0, sizeof(raw) - psize); + psize = bpf_prog_insn_size(fp); + memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA_MESSAGE_BYTES); blocks = bsize / SHA_MESSAGE_BYTES; + todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { @@ -192,13 +205,17 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { - sha_transform(fp->digest, todo, ws); + sha_transform(digest, todo, ws); todo += SHA_MESSAGE_BYTES; } - result = (__force __be32 *)fp->digest; + result = (__force __be32 *)digest; for (i = 0; i < SHA_DIGEST_WORDS; i++) - result[i] = cpu_to_be32(fp->digest[i]); + result[i] = cpu_to_be32(digest[i]); + memcpy(fp->tag, result, sizeof(fp->tag)); + + vfree(raw); + return 0; } static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34debc1a9641..3f2bb58952d8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -274,7 +274,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + if (htab->map.value_size >= KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct htab_elem)) /* if value_size is bigger, the user space won't be able to * access the elements via bpf syscall. This check also makes diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4819ec9d95f6..1d6b29e4e2c3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -615,19 +615,39 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + unsigned long user_bufs; + + if (user) { + user_bufs = atomic_long_add_return(pages, &user->locked_vm); + if (user_bufs > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + } + + return 0; +} + +void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ + if (user) + atomic_long_sub(pages, &user->locked_vm); +} + static int bpf_prog_charge_memlock(struct bpf_prog *prog) { struct user_struct *user = get_current_user(); - unsigned long memlock_limit; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + int ret; - atomic_long_add(prog->pages, &user->locked_vm); - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(prog->pages, &user->locked_vm); + ret = __bpf_prog_charge(user, prog->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } + prog->aux->user = user; return 0; } @@ -636,7 +656,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) { struct user_struct *user = prog->aux->user; - atomic_long_sub(prog->pages, &user->locked_vm); + __bpf_prog_uncharge(user, prog->pages); free_uid(user); } @@ -668,17 +688,17 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; - char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" - "prog_digest:\t%s\n" + "prog_tag:\t%s\n" "memlock:\t%llu\n", prog->type, prog->jited, - prog_digest, + prog_tag, prog->pages * 1ULL << PAGE_SHIFT); } #endif @@ -811,7 +831,7 @@ static int bpf_prog_load(union bpf_attr *attr) err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), - prog->len * sizeof(struct bpf_insn)) != 0) + bpf_prog_insn_size(prog)) != 0) goto free_prog; prog->orig_prog = NULL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d28f9a3380a9..cdc43b899f28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -462,14 +462,19 @@ static void init_reg_state(struct bpf_reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { - BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].id = 0; regs[regno].imm = 0; } +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + __mark_reg_unknown_value(regs, regno); +} + static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; @@ -1970,8 +1975,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { reg->type = type; + /* We don't need id from this point onwards anymore, thus we + * should better reset it, so that state pruning has chances + * to take effect. + */ + reg->id = 0; if (type == UNKNOWN_VALUE) - mark_reg_unknown_value(regs, regno); + __mark_reg_unknown_value(regs, regno); } } @@ -1982,16 +1992,16 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs; + u32 id = regs[regno].id; int i; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, regs[regno].id, type); + mark_map_reg(regs, i, id, type); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, - regs[regno].id, type); + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type); } } @@ -2926,6 +2936,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i, j, err; + err = bpf_prog_calc_tag(env->prog); + if (err) + return err; + for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { @@ -3173,8 +3187,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } - bpf_prog_calc_digest(env->prog); - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/capability.c b/kernel/capability.c index 00411c82dac5..f97fe77ceb88 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -17,7 +17,7 @@ #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Leveraged for setting/resetting capabilities @@ -318,6 +318,7 @@ bool has_capability(struct task_struct *t, int cap) { return has_ns_capability(t, &init_user_ns, cap); } +EXPORT_SYMBOL(has_capability); /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) @@ -457,6 +458,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, EXPORT_SYMBOL(file_ns_capable); /** + * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? + * @ns: The user namespace in question + * @inode: The inode in question + * + * Return true if the inode uid and gid are within the namespace. + */ +bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) +{ + return kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); +} + +/** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question @@ -469,7 +483,26 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && - kgid_has_mapping(ns, inode->i_gid); + return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); + +/** + * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace + * @tsk: The task that may be ptraced + * @ns: The user namespace to search for CAP_SYS_PTRACE in + * + * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE + * in the specified user namespace. + */ +bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) +{ + int ret = 0; /* An absent tracer adds no restrictions */ + const struct cred *cred; + rcu_read_lock(); + cred = rcu_dereference(tsk->ptracer_cred); + if (cred) + ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + rcu_read_unlock(); + return (ret == 0); +} diff --git a/kernel/compat.c b/kernel/compat.c index b3a047f208a7..19aec5d98108 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -28,7 +28,7 @@ #include <linux/ptrace.h> #include <linux/gfp.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) { diff --git a/kernel/configs.c b/kernel/configs.c index c18b1f1ae515..2df132b20217 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -28,7 +28,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /**************************************************/ /* the actual current config file */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 217fd2e7f435..c47506357519 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -183,23 +183,16 @@ EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. - * The APIs cpu_notifier_register_begin/done() must be used to protect CPU - * hotplug callback (un)registration performed using __register_cpu_notifier() - * or __unregister_cpu_notifier(). */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } -EXPORT_SYMBOL(cpu_notifier_register_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } -EXPORT_SYMBOL(cpu_notifier_register_done); - -static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock @@ -349,66 +342,7 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -/* Need to know about CPUs going up/down? */ -int register_cpu_notifier(struct notifier_block *nb) -{ - int ret; - cpu_maps_update_begin(); - ret = raw_notifier_chain_register(&cpu_chain, nb); - cpu_maps_update_done(); - return ret; -} - -int __register_cpu_notifier(struct notifier_block *nb) -{ - return raw_notifier_chain_register(&cpu_chain, nb); -} - -static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, - int *nr_calls) -{ - unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0; - void *hcpu = (void *)(long)cpu; - - int ret; - - ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, - nr_calls); - - return notifier_to_errno(ret); -} - -static int cpu_notify(unsigned long val, unsigned int cpu) -{ - return __cpu_notify(val, cpu, -1, NULL); -} - -static void cpu_notify_nofail(unsigned long val, unsigned int cpu) -{ - BUG_ON(cpu_notify(val, cpu)); -} - /* Notifier wrappers for transitioning to state machine */ -static int notify_prepare(unsigned int cpu) -{ - int nr_calls = 0; - int ret; - - ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); - if (ret) { - nr_calls--; - printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); - } - return ret; -} - -static int notify_online(unsigned int cpu) -{ - cpu_notify(CPU_ONLINE, cpu); - return 0; -} static int bringup_wait_for_ap(unsigned int cpu) { @@ -433,10 +367,8 @@ static int bringup_cpu(unsigned int cpu) /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); irq_unlock_sparse(); - if (ret) { - cpu_notify(CPU_UP_CANCELED, cpu); + if (ret) return ret; - } ret = bringup_wait_for_ap(cpu); BUG_ON(!cpu_online(cpu)); return ret; @@ -565,11 +497,6 @@ static void cpuhp_thread_fun(unsigned int cpu) BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); undo_cpu_down(cpu, st); - /* - * This is a momentary workaround to keep the notifier users - * happy. Will go away once we got rid of the notifiers. - */ - cpu_notify_nofail(CPU_DOWN_FAILED, cpu); st->rollback = false; } else { /* Cannot happen .... */ @@ -659,22 +586,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -EXPORT_SYMBOL(register_cpu_notifier); -EXPORT_SYMBOL(__register_cpu_notifier); -void unregister_cpu_notifier(struct notifier_block *nb) -{ - cpu_maps_update_begin(); - raw_notifier_chain_unregister(&cpu_chain, nb); - cpu_maps_update_done(); -} -EXPORT_SYMBOL(unregister_cpu_notifier); - -void __unregister_cpu_notifier(struct notifier_block *nb) -{ - raw_notifier_chain_unregister(&cpu_chain, nb); -} -EXPORT_SYMBOL(__unregister_cpu_notifier); - #ifdef CONFIG_HOTPLUG_CPU /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU @@ -741,20 +652,6 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -static int notify_down_prepare(unsigned int cpu) -{ - int err, nr_calls = 0; - - err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - } - return err; -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { @@ -833,13 +730,6 @@ static int takedown_cpu(unsigned int cpu) return 0; } -static int notify_dead(unsigned int cpu) -{ - cpu_notify_nofail(CPU_DEAD, cpu); - check_for_tasks(cpu); - return 0; -} - static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; @@ -863,9 +753,7 @@ void cpuhp_report_idle_dead(void) } #else -#define notify_down_prepare NULL #define takedown_cpu NULL -#define notify_dead NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -924,9 +812,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; out: cpu_hotplug_done(); - /* This post dead nonsense must die */ - if (!ret && hasdied) - cpu_notify_nofail(CPU_POST_DEAD, cpu); return ret; } @@ -1292,17 +1177,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = rcutree_dead_cpu, }, /* - * Preparatory and dead notifiers. Will be replaced once the notifiers - * are converted to states. - */ - [CPUHP_NOTIFY_PREPARE] = { - .name = "notify:prepare", - .startup.single = notify_prepare, - .teardown.single = notify_dead, - .skip_onerr = true, - .cant_stop = true, - }, - /* * On the tear-down path, timers_dead_cpu() must be invoked * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. @@ -1391,17 +1265,6 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = rcutree_online_cpu, .teardown.single = rcutree_offline_cpu, }, - - /* - * Online/down_prepare notifiers. Will be removed once the notifiers - * are converted to states. - */ - [CPUHP_AP_NOTIFY_ONLINE] = { - .name = "notify:online", - .startup.single = notify_online, - .teardown.single = notify_down_prepare, - .skip_onerr = true, - }, #endif /* * The dynamically registered state space is here @@ -1432,23 +1295,67 @@ static int cpuhp_cb_check(enum cpuhp_state state) return 0; } -static void cpuhp_store_callbacks(enum cpuhp_state state, - const char *name, - int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu), - bool multi_instance) +/* + * Returns a free for dynamic slot assignment of the Online state. The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. + */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i, end; + struct cpuhp_step *step; + + switch (state) { + case CPUHP_AP_ONLINE_DYN: + step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; + end = CPUHP_AP_ONLINE_DYN_END; + break; + case CPUHP_BP_PREPARE_DYN: + step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; + end = CPUHP_BP_PREPARE_DYN_END; + break; + default: + return -EINVAL; + } + + for (i = state; i <= end; i++, step++) { + if (!step->name) + return i; + } + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + +static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; + int ret = 0; mutex_lock(&cpuhp_state_mutex); + + if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) { + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } sp = cpuhp_get_step(state); + if (name && sp->name) { + ret = -EBUSY; + goto out; + } sp->startup.single = startup; sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); +out: mutex_unlock(&cpuhp_state_mutex); + return ret; } static void *cpuhp_get_teardown_cb(enum cpuhp_state state) @@ -1509,29 +1416,6 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, } } -/* - * Returns a free for dynamic slot assignment of the Online state. The states - * are protected by the cpuhp_slot_states mutex and an empty slot is identified - * by having no name assigned. - */ -static int cpuhp_reserve_state(enum cpuhp_state state) -{ - enum cpuhp_state i; - - mutex_lock(&cpuhp_state_mutex); - for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { - if (cpuhp_ap_states[i].name) - continue; - - cpuhp_ap_states[i].name = "Reserved"; - mutex_unlock(&cpuhp_state_mutex); - return i; - } - mutex_unlock(&cpuhp_state_mutex); - WARN(1, "No more dynamic states available for CPU hotplug\n"); - return -ENOSPC; -} - int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke) { @@ -1580,13 +1464,19 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state - * @state: The state to setup - * @invoke: If true, the startup function is invoked for cpus where - * cpu state >= @state - * @startup: startup callback function - * @teardown: teardown callback function + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function + * @multi_instance: State is set up for multiple instances which get + * added afterwards. * - * Returns 0 if successful, otherwise a proper error code + * Returns: + * On success: + * Positive state number if @state is CPUHP_AP_ONLINE_DYN + * 0 for all other states + * On failure: proper (negative) error code */ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, @@ -1595,25 +1485,23 @@ int __cpuhp_setup_state(enum cpuhp_state state, bool multi_instance) { int cpu, ret = 0; - int dyn_state = 0; + bool dynstate; if (cpuhp_cb_check(state) || !name) return -EINVAL; get_online_cpus(); - /* currently assignments for the ONLINE state are possible */ - if (state == CPUHP_AP_ONLINE_DYN) { - dyn_state = 1; - ret = cpuhp_reserve_state(state); - if (ret < 0) - goto out; + ret = cpuhp_store_callbacks(state, name, startup, teardown, + multi_instance); + + dynstate = state == CPUHP_AP_ONLINE_DYN; + if (ret > 0 && dynstate) { state = ret; + ret = 0; } - cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); - - if (!invoke || !startup) + if (ret || !invoke || !startup) goto out; /* @@ -1637,7 +1525,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: put_online_cpus(); - if (!ret && dyn_state) + /* + * If the requested state is CPUHP_AP_ONLINE_DYN, return the + * dynamically allocated state in case of success. + */ + if (!ret && dynstate) return state; return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 29f815d2ef7e..b3088886cd37 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -55,7 +55,7 @@ #include <linux/backing-dev.h> #include <linux/sort.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/atomic.h> #include <linux/mutex.h> #include <linux/cgroup.h> diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0874e2edd275..79517e5549f1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -598,11 +598,11 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - time_left = loops_per_jiffy * HZ; + time_left = MSEC_PER_SEC; while (kgdb_do_roundup && --time_left && (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != online_cpus) - cpu_relax(); + udelay(1000); if (!time_left) pr_crit("Timed out waiting for secondary CPUs.\n"); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 98c9011eac78..e74be38245ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -30,6 +30,7 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; +int kdb_printf_cpu = -1; static int kgdb_transition_check(char *buffer) { @@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; - int got_printf_lock = 0; int retlen = 0; int fnd, len; + int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; struct console *c = console_drivers; - static DEFINE_SPINLOCK(kdb_printf_lock); unsigned long uninitialized_var(flags); - preempt_disable(); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; - /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text. */ - if (!KDB_STATE(PRINTF_LOCK)) { - KDB_STATE_SET(PRINTF_LOCK); - spin_lock_irqsave(&kdb_printf_lock, flags); - got_printf_lock = 1; - atomic_inc(&kdb_event); - } else { - __acquire(kdb_printf_lock); + local_irq_save(flags); + this_cpu = smp_processor_id(); + for (;;) { + old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); + if (old_cpu == -1 || old_cpu == this_cpu) + break; + + cpu_relax(); } diag = kdbgetintenv("LINES", &linecount); @@ -847,16 +843,9 @@ kdb_print_out: suspend_grep = 0; /* end of what may have been a recursive call */ if (logging) console_loglevel = saved_loglevel; - if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { - got_printf_lock = 0; - spin_unlock_irqrestore(&kdb_printf_lock, flags); - KDB_STATE_CLEAR(PRINTF_LOCK); - atomic_dec(&kdb_event); - } else { - __release(kdb_printf_lock); - } - kdb_trap_printk = saved_trap_printk; - preempt_enable(); + /* kdb_printf_cpu locked the code above. */ + smp_store_release(&kdb_printf_cpu, old_cpu); + local_irq_restore(flags); return retlen; } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2a20c0dfdafc..ca183919d302 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -60,7 +60,6 @@ int kdb_grep_trailing; * Kernel debugger state flags */ int kdb_flags; -atomic_t kdb_event; /* * kdb_lock protects updates to kdb_initial_cpu. Used to diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 75014d7f4568..fc224fbcf954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -132,7 +132,6 @@ extern int kdb_state; #define KDB_STATE_PAGER 0x00000400 /* pager is available */ #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching * back to initial cpu */ -#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been diff --git a/kernel/events/core.c b/kernel/events/core.c index faf073d0287f..110b38a58493 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2249,7 +2249,7 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - bool activate = true; + bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); @@ -2257,27 +2257,26 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&ctx->lock); task_ctx = ctx; - /* If we're on the wrong CPU, try again */ - if (task_cpu(ctx->task) != smp_processor_id()) { - ret = -ESRCH; - goto unlock; - } + reprogram = (ctx->task == current); /* - * If we're on the right CPU, see if the task we target is - * current, if not we don't have to activate the ctx, a future - * context switch will do that for us. + * If the task is running, it must be running on this CPU, + * otherwise we cannot reprogram things. + * + * If its not running, we don't care, ctx->lock will + * serialize against it becoming runnable. */ - if (ctx->task != current) - activate = false; - else - WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); + if (task_curr(ctx->task) && !reprogram) { + ret = -ESRCH; + goto unlock; + } + WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } - if (activate) { + if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx); @@ -2328,13 +2327,36 @@ perf_install_in_context(struct perf_event_context *ctx, /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. + * + * Instead we use task_curr(), which tells us if the task is running. + * However, since we use task_curr() outside of rq::lock, we can race + * against the actual state. This means the result can be wrong. + * + * If we get a false positive, we retry, this is harmless. + * + * If we get a false negative, things are complicated. If we are after + * perf_event_context_sched_in() ctx::lock will serialize us, and the + * value must be correct. If we're before, it doesn't matter since + * perf_event_context_sched_in() will program the counter. + * + * However, this hinges on the remote context switch having observed + * our task->perf_event_ctxp[] store, such that it will in fact take + * ctx::lock in perf_event_context_sched_in(). + * + * We do this by task_function_call(), if the IPI fails to hit the task + * we know any future context switch of task must see the + * perf_event_ctpx[] store. */ -again: + /* - * Cannot use task_function_call() because we need to run on the task's - * CPU regardless of whether its current or not. + * This smp_mb() orders the task->perf_event_ctxp[] store with the + * task_cpu() load, such that if the IPI then does not find the task + * running, a future context switch of that task must observe the + * store. */ - if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + smp_mb(); +again: + if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); @@ -2348,12 +2370,16 @@ again: raw_spin_unlock_irq(&ctx->lock); return; } - raw_spin_unlock_irq(&ctx->lock); /* - * Since !ctx->is_active doesn't mean anything, we must IPI - * unconditionally. + * If the task is not running, ctx->lock will avoid it becoming so, + * thus we can safely install the event. */ - goto again; + if (task_curr(task)) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); } /* @@ -6698,7 +6724,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file->f_inode) + if (filter->inode != file_inode(file)) return false; if (filter->offset > offset + size) @@ -7034,25 +7060,12 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } -/* - * Generic event overflow handling, sampling. - */ - -static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) +static int +__perf_event_account_interrupt(struct perf_event *event, int throttle) { - int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; - u64 seq; int ret = 0; - - /* - * Non-sampling counters might still use the PMI to fold short - * hardware counters, ignore those. - */ - if (unlikely(!is_sampling_event(event))) - return 0; + u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { @@ -7080,6 +7093,34 @@ static int __perf_event_overflow(struct perf_event *event, perf_adjust_period(event, delta, hwc->last_period, true); } + return ret; +} + +int perf_event_account_interrupt(struct perf_event *event) +{ + return __perf_event_account_interrupt(event, 1); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + int ret = 0; + + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + + ret = __perf_event_account_interrupt(event, throttle); + /* * XXX event_limit might not quite work as expected on inherited * events @@ -9503,6 +9544,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } +/* + * Variation on perf_event_ctx_lock_nested(), except we take two context + * mutexes. + */ +static struct perf_event_context * +__perf_event_ctx_lock_double(struct perf_event *group_leader, + struct perf_event_context *ctx) +{ + struct perf_event_context *gctx; + +again: + rcu_read_lock(); + gctx = READ_ONCE(group_leader->ctx); + if (!atomic_inc_not_zero(&gctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock_double(&gctx->mutex, &ctx->mutex); + + if (group_leader->ctx != gctx) { + mutex_unlock(&ctx->mutex); + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + goto again; + } + + return gctx; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -9746,12 +9818,31 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - gctx = group_leader->ctx; - mutex_lock_double(&gctx->mutex, &ctx->mutex); + gctx = __perf_event_ctx_lock_double(group_leader, ctx); + if (gctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } + + /* + * Check if we raced against another sys_perf_event_open() call + * moving the software group underneath us. + */ + if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * If someone moved the group out from under us, check + * if this new event wound up on the same ctx, if so + * its the regular !move_group case, otherwise fail. + */ + if (gctx != ctx) { + err = -EINVAL; + goto err_locked; + } else { + perf_event_ctx_unlock(group_leader, gctx); + move_group = 0; + } + } } else { mutex_lock(&ctx->mutex); } @@ -9853,7 +9944,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { @@ -9879,7 +9970,7 @@ SYSCALL_DEFINE5(perf_event_open, err_locked: if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); /* err_file: */ fput(event_file); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..d416f3baf392 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma); + &vma, NULL); if (ret <= 0) return ret; @@ -1194,7 +1194,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); + arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL); + NULL, NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index aacff8e2aec0..8f14b866f9f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,7 +56,7 @@ #include <linux/kcov.h> #include <linux/random.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> diff --git a/kernel/extable.c b/kernel/extable.c index e820ccee9846..e3beec4a2339 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -22,7 +22,7 @@ #include <linux/init.h> #include <asm/sections.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * mutex protecting text section modification (dynamic code patching). diff --git a/kernel/fork.c b/kernel/fork.c index a439ac429669..11c5c8ab827c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -79,7 +79,7 @@ #include <asm/pgtable.h> #include <asm/pgalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -747,7 +747,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + struct user_namespace *user_ns) { mm->mmap = NULL; mm->mm_rb = RB_ROOT; @@ -787,6 +788,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) if (init_new_context(p, mm)) goto fail_nocontext; + mm->user_ns = get_user_ns(user_ns); return mm; fail_nocontext: @@ -832,7 +834,7 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - return mm_init(mm, current); + return mm_init(mm, current, current_user_ns()); } /* @@ -847,6 +849,7 @@ void __mmdrop(struct mm_struct *mm) destroy_context(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); + put_user_ns(mm->user_ns); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1128,7 +1131,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm, tsk)) + if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; err = dup_mmap(mm, oldmm); diff --git a/kernel/futex.c b/kernel/futex.c index 9246d9f593d1..0842c8ca534b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2459,7 +2459,7 @@ retry: restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; - restart->futex.time = abs_time->tv64; + restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; @@ -2480,7 +2480,7 @@ static long futex_wait_restart(struct restart_block *restart) ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t.tv64 = restart->futex.time; + t = restart->futex.time; tp = &t; } restart->fn = do_no_restart_syscall; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 4ae3232e7a28..3f409968e466 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -13,7 +13,7 @@ #include <linux/ptrace.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* diff --git a/kernel/groups.c b/kernel/groups.c index 2fcadd66a8fd..8dd7a61b7115 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -8,7 +8,7 @@ #include <linux/syscalls.h> #include <linux/user_namespace.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> struct group_info *groups_alloc(int gidsetsize) { diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 9be9bda7c1f9..4544b115f5eb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -37,10 +37,10 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) { - int n, nodes; + int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ - for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + for_each_online_node(n) { if (cpumask_intersects(mask, cpumask_of_node(n))) { node_set(n, *nodemsk); nodes++; @@ -82,7 +82,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); /* - * If the number of nodes in the mask is less than or equal the + * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ if (affv <= nodes) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93ad6c1fb9b6..a9b8cf500591 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -182,6 +182,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +void static_key_deferred_flush(struct static_key_deferred *key) +{ + STATIC_KEY_CHECK_USE(); + flush_delayed_work(&key->work); +} +EXPORT_SYMBOL_GPL(static_key_deferred_flush); + void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { diff --git a/kernel/kcov.c b/kernel/kcov.c index 3cbb0c879705..85e5546cd791 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,11 +1,16 @@ #define pr_fmt(fmt) "kcov: " fmt #define DISABLE_BRANCH_PROFILING +#include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/init.h> #include <linux/mm.h> +#include <linux/preempt.h> #include <linux/printk.h> #include <linux/sched.h> #include <linux/slab.h> @@ -14,6 +19,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/kcov.h> +#include <asm/setup.h> /* * kcov descriptor (one per opened debugfs file). @@ -68,6 +74,11 @@ void notrace __sanitizer_cov_trace_pc(void) if (mode == KCOV_MODE_TRACE) { unsigned long *area; unsigned long pos; + unsigned long ip = _RET_IP_; + +#ifdef CONFIG_RANDOMIZE_BASE + ip -= kaslr_offset(); +#endif /* * There is some code that runs in interrupts but for which @@ -81,7 +92,7 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first word is number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = _RET_IP_; + area[pos] = ip; WRITE_ONCE(area[0], pos); } } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..5617cc412444 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, while (hole_end <= crashk_res.end) { unsigned long i; + cond_resched(); + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ @@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void) #endif VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_X86 - VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #endif diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 037c321c5618..b56a558e406d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/fs.h> +#include <linux/ima.h> #include <crypto/hash.h> #include <crypto/sha.h> #include <linux/syscalls.h> @@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, return ret; image->kernel_buf_len = size; + /* IMA needs to pass the measurement list to the next kernel. */ + ima_add_kexec_buffer(image); + /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); @@ -428,25 +432,65 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -/* - * Helper function for placing a buffer in a kexec segment. This assumes - * that kexec_mutex is held. +/** + * arch_kexec_walk_mem - call func(data) on free memory regions + * @kbuf: Context info for the search. Also passed to @func. + * @func: Function to call for each memory region. + * + * Return: The memory walk will stop when func returns a non-zero value + * and that value will be returned. If all free regions are visited without + * func returning non-zero, then zero will be returned. + */ +int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(u64, u64, void *)) +{ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, + kbuf, func); + else + return walk_system_ram_res(0, ULONG_MAX, kbuf, func); +} + +/** + * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + int ret; + + ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} + +/** + * kexec_add_buffer - place a buffer in a kexec segment + * @kbuf: Buffer contents and memory parameters. + * + * This function assumes that kexec_mutex is held. + * On successful return, @kbuf->mem will have the physical address of + * the buffer in memory. + * + * Return: 0 on success, negative errno on error. */ -int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, - unsigned long memsz, unsigned long buf_align, - unsigned long buf_min, unsigned long buf_max, - bool top_down, unsigned long *load_addr) +int kexec_add_buffer(struct kexec_buf *kbuf) { struct kexec_segment *ksegment; - struct kexec_buf buf, *kbuf; int ret; /* Currently adding segment this way is allowed only in file mode */ - if (!image->file_mode) + if (!kbuf->image->file_mode) return -EINVAL; - if (image->nr_segments >= KEXEC_SEGMENT_MAX) + if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX) return -EINVAL; /* @@ -456,45 +500,27 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, * logic goes through list of segments to make sure there are * no destination overlaps. */ - if (!list_empty(&image->control_pages)) { + if (!list_empty(&kbuf->image->control_pages)) { WARN_ON(1); return -EINVAL; } - memset(&buf, 0, sizeof(struct kexec_buf)); - kbuf = &buf; - kbuf->image = image; - kbuf->buffer = buffer; - kbuf->bufsz = bufsz; - - kbuf->memsz = ALIGN(memsz, PAGE_SIZE); - kbuf->buf_align = max(buf_align, PAGE_SIZE); - kbuf->buf_min = buf_min; - kbuf->buf_max = buf_max; - kbuf->top_down = top_down; + /* Ensure minimum alignment needed for segments. */ + kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); + kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ - if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res_desc(crashk_res.desc, - IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); - else - ret = walk_system_ram_res(0, -1, kbuf, - locate_mem_hole_callback); - if (ret != 1) { - /* A suitable memory range could not be found for buffer */ - return -EADDRNOTAVAIL; - } + ret = kexec_locate_mem_hole(kbuf); + if (ret) + return ret; /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments]; + ksegment = &kbuf->image->segment[kbuf->image->nr_segments]; ksegment->kbuf = kbuf->buffer; ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; - image->nr_segments++; - *load_addr = ksegment->mem; + kbuf->image->nr_segments++; return 0; } @@ -616,13 +642,15 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, unsigned long max, int top_down) { struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; - unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned long align, bss_align, bss_sz, bss_pad; + unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; unsigned char *buf_addr, *src; int i, ret = 0, entry_sidx = -1; const Elf_Shdr *sechdrs_c; Elf_Shdr *sechdrs = NULL; - void *purgatory_buf = NULL; + struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, + .buf_min = min, .buf_max = max, + .top_down = top_down }; /* * sechdrs_c points to section headers in purgatory and are read @@ -688,9 +716,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, } /* Determine how much memory is needed to load relocatable object. */ - buf_align = 1; bss_align = 1; - buf_sz = 0; bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { @@ -699,10 +725,10 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, align = sechdrs[i].sh_addralign; if (sechdrs[i].sh_type != SHT_NOBITS) { - if (buf_align < align) - buf_align = align; - buf_sz = ALIGN(buf_sz, align); - buf_sz += sechdrs[i].sh_size; + if (kbuf.buf_align < align) + kbuf.buf_align = align; + kbuf.bufsz = ALIGN(kbuf.bufsz, align); + kbuf.bufsz += sechdrs[i].sh_size; } else { /* bss section */ if (bss_align < align) @@ -714,32 +740,31 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, /* Determine the bss padding required to align bss properly */ bss_pad = 0; - if (buf_sz & (bss_align - 1)) - bss_pad = bss_align - (buf_sz & (bss_align - 1)); + if (kbuf.bufsz & (bss_align - 1)) + bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - memsz = buf_sz + bss_pad + bss_sz; + kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; /* Allocate buffer for purgatory */ - purgatory_buf = vzalloc(buf_sz); - if (!purgatory_buf) { + kbuf.buffer = vzalloc(kbuf.bufsz); + if (!kbuf.buffer) { ret = -ENOMEM; goto out; } - if (buf_align < bss_align) - buf_align = bss_align; + if (kbuf.buf_align < bss_align) + kbuf.buf_align = bss_align; /* Add buffer to segment list */ - ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, - buf_align, min, max, top_down, - &pi->purgatory_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) goto out; + pi->purgatory_load_addr = kbuf.mem; /* Load SHF_ALLOC sections */ - buf_addr = purgatory_buf; + buf_addr = kbuf.buffer; load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + buf_sz + bss_pad; + bss_addr = load_addr + kbuf.bufsz + bss_pad; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -785,11 +810,11 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, * Used later to identify which section is purgatory and skip it * from checksumming. */ - pi->purgatory_buf = purgatory_buf; + pi->purgatory_buf = kbuf.buffer; return ret; out: vfree(sechdrs); - vfree(purgatory_buf); + vfree(kbuf.buffer); return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 0a52315d9c62..4cef7e4706b0 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -20,22 +20,6 @@ struct kexec_sha_region { unsigned long len; }; -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. - */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/kmod.c b/kernel/kmod.c index 0277d1216f80..d45c96073afb 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -39,7 +39,7 @@ #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <trace/events/module.h> diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d63095472ea9..43460104f119 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -52,7 +52,7 @@ #include <asm/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7bd265f6b098..7c38f8f3d97b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3191,7 +3191,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock); +static int __lock_is_held(struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. @@ -3332,7 +3332,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock)) + if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) @@ -3579,7 +3579,7 @@ found_it: return 1; } -static int __lock_is_held(struct lockdep_map *lock) +static int __lock_is_held(struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3587,8 +3587,12 @@ static int __lock_is_held(struct lockdep_map *lock) for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (read == -1 || hlock->read == read) + return 1; + + return 0; + } } return 0; @@ -3772,7 +3776,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held(struct lockdep_map *lock) +int lock_is_held_type(struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -3784,13 +3788,13 @@ int lock_is_held(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - ret = __lock_is_held(lock); + ret = __lock_is_held(lock, read); current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(lock_is_held); +EXPORT_SYMBOL_GPL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index a0f61effad25..6d1fcc786081 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -18,7 +18,7 @@ #include <linux/debug_locks.h> #include <linux/vmalloc.h> #include <linux/sort.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/div64.h> #include "lockdep_internals.h" diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index eb0a599fcf58..e852be4851fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -108,11 +108,7 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - counter = (long)(file->f_inode->i_private); + counter = (long)file_inode(file)->i_private; if (counter >= qstat_num) return -EBADF; @@ -177,11 +173,7 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + if ((long)file_inode(file)->i_private != qstat_reset_cnts) return count; for_each_possible_cpu(cpu) { diff --git a/kernel/memremap.c b/kernel/memremap.c index b501e390bb34..9ecedc28b928 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -246,7 +246,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); + mem_hotplug_begin(); arch_remove_memory(align_start, align_size); + mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, @@ -358,7 +360,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_pfn_remap; + mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/kernel/module.c b/kernel/module.c index 0e54d5bf0097..38d4270925d4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,7 +46,7 @@ #include <linux/string.h> #include <linux/mutex.h> #include <linux/rculist.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <linux/license.h> @@ -313,8 +313,11 @@ struct load_info { } index; }; -/* We require a truly strong try_module_get(): 0 means failure due to - ongoing or failed initialization etc. */ +/* + * We require a truly strong try_module_get(): 0 means success. + * Otherwise an error is returned due to ongoing or failed + * initialization etc. + */ static inline int strong_try_module_get(struct module *mod) { BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); @@ -330,7 +333,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, enum lockdep_ok lockdep_ok) { add_taint(flag, lockdep_ok); - mod->taints |= (1U << flag); + set_bit(flag, &mod->taints); } /* @@ -1138,24 +1141,13 @@ static inline int module_unload_init(struct module *mod) static size_t module_flags_taint(struct module *mod, char *buf) { size_t l = 0; + int i; + + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + if (taint_flags[i].module && test_bit(i, &mod->taints)) + buf[l++] = taint_flags[i].c_true; + } - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) - buf[l++] = 'E'; - if (mod->taints & (1 << TAINT_LIVEPATCH)) - buf[l++] = 'K'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ return l; } @@ -1911,6 +1903,9 @@ static void frob_writable_data(const struct module_layout *layout, /* livepatching wants to disable read-only so it can frob module. */ void module_disable_ro(const struct module *mod) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); frob_ro_after_init(&mod->core_layout, set_memory_rw); @@ -1920,6 +1915,9 @@ void module_disable_ro(const struct module *mod) void module_enable_ro(const struct module *mod, bool after_init) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); @@ -1952,6 +1950,9 @@ void set_all_modules_text_rw(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) @@ -1968,9 +1969,18 @@ void set_all_modules_text_ro(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) + /* + * Ignore going modules since it's possible that ro + * protection has already been disabled, otherwise we'll + * run into protection faults at module deallocation. + */ + if (mod->state == MODULE_STATE_UNFORMED || + mod->state == MODULE_STATE_GOING) continue; frob_text(&mod->core_layout, set_memory_ro); @@ -1981,10 +1991,12 @@ void set_all_modules_text_ro(void) static void disable_ro_nx(const struct module_layout *layout) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); + if (rodata_enabled) { + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_rw); + } frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_rw); frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -3709,6 +3721,7 @@ static int load_module(struct load_info *info, const char __user *uargs, sysfs_cleanup: mod_sysfs_teardown(mod); coming_cleanup: + mod->state = MODULE_STATE_GOING; blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); @@ -4042,6 +4055,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, } #endif /* CONFIG_KALLSYMS */ +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ static char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -4086,7 +4103,7 @@ static void m_stop(struct seq_file *m, void *p) static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4257,7 +4274,7 @@ EXPORT_SYMBOL_GPL(__module_text_address); void print_modules(void) { struct module *mod; - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ diff --git a/kernel/padata.c b/kernel/padata.c index 7848f0566403..05316c9f32da 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -64,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd) static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); pqueue = container_of(parallel_work, struct padata_parallel_queue, work); - pd = pqueue->pd; - pinst = pd->pinst; spin_lock(&pqueue->parallel.lock); list_replace_init(&pqueue->parallel.list, &local_list); diff --git a/kernel/panic.c b/kernel/panic.c index e6480e20379e..901c4fb46002 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -298,30 +298,27 @@ void panic(const char *fmt, ...) EXPORT_SYMBOL(panic); - -struct tnt { - u8 bit; - char true; - char false; -}; - -static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, - { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, - { TAINT_OOT_MODULE, 'O', ' ' }, - { TAINT_UNSIGNED_MODULE, 'E', ' ' }, - { TAINT_SOFTLOCKUP, 'L', ' ' }, - { TAINT_LIVEPATCH, 'K', ' ' }, +/* + * TAINT_FORCED_RMMOD could be a per-module flag but the module + * is being removed anyway. + */ +const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { + { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ + { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ + { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ + { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ + { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ + { 'B', ' ', false }, /* TAINT_BAD_PAGE */ + { 'U', ' ', false }, /* TAINT_USER */ + { 'D', ' ', false }, /* TAINT_DIE */ + { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ + { 'W', ' ', false }, /* TAINT_WARN */ + { 'C', ' ', true }, /* TAINT_CRAP */ + { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ + { 'O', ' ', true }, /* TAINT_OOT_MODULE */ + { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ + { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ + { 'K', ' ', true }, /* TAINT_LIVEPATCH */ }; /** @@ -348,17 +345,17 @@ static const struct tnt tnts[] = { */ const char *print_tainted(void) { - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; + static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; if (tainted_mask) { char *s; int i; s = buf + sprintf(buf, "Tainted: "); - for (i = 0; i < ARRAY_SIZE(tnts); i++) { - const struct tnt *t = &tnts[i]; - *s++ = test_bit(t->bit, &tainted_mask) ? - t->true : t->false; + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + const struct taint_flag *t = &taint_flags[i]; + *s++ = test_bit(i, &tainted_mask) ? + t->c_true : t->c_false; } *s = 0; } else diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index df9e8e9e0be7..eef2ce968636 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -151,8 +151,12 @@ out: static void delayed_free_pidns(struct rcu_head *p) { - kmem_cache_free(pid_ns_cachep, - container_of(p, struct pid_namespace, rcu)); + struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu); + + dec_pid_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + + kmem_cache_free(pid_ns_cachep, ns); } static void destroy_pid_namespace(struct pid_namespace *ns) @@ -162,8 +166,6 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); - dec_pid_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); call_rcu(&ns->rcu, delayed_free_pidns); } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4f0f0604f1c4..2d8e2b227db8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -30,7 +30,7 @@ #include <linux/compiler.h> #include <linux/ktime.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> diff --git a/kernel/power/user.c b/kernel/power/user.c index 35310b627388..22df9f7ff672 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -25,7 +25,7 @@ #include <linux/cpu.h> #include <linux/freezer.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "power.h" diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 577f2288d19f..8b2696420abb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -46,7 +46,7 @@ #include <linux/ctype.h> #include <linux/uio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/sections.h> #define CREATE_TRACE_POINTS @@ -356,7 +356,6 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; -static enum log_flags syslog_prev; static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ @@ -370,7 +369,6 @@ static u32 log_next_idx; /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; -static enum log_flags console_prev; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; @@ -639,27 +637,15 @@ static void append_char(char **pp, char *e, char c) } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) + struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; - char cont = '-'; do_div(ts_usec, 1000); - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. - */ - if (msg->flags & LOG_CONT) - cont = (prev_flags & LOG_CONT) ? '+' : 'c'; - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, cont); + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 'c' : '-'); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -714,7 +700,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct devkmsg_user { u64 seq; u32 idx; - enum log_flags prev; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; @@ -748,7 +733,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; buf[len] = '\0'; - if (copy_from_iter(buf, len, from) != len) { + if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } @@ -824,12 +809,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq, user->prev); + msg, user->seq); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, log_dict(msg), msg->dict_len, log_text(msg), msg->text_len); - user->prev = msg->flags; user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); @@ -1210,26 +1194,12 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, - bool syslog, char *buf, size_t size) +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; - bool prefix = true; - bool newline = true; size_t len = 0; - if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) - prefix = false; - - if (msg->flags & LOG_CONT) { - if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) - prefix = false; - - if (!(msg->flags & LOG_NEWLINE)) - newline = false; - } - do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -1247,22 +1217,17 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, text_len + 1 >= size - len) break; - if (prefix) - len += print_prefix(msg, syslog, buf + len); + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - if (next || newline) - buf[len++] = '\n'; + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - if (prefix) - len += print_prefix(msg, syslog, NULL); + len += print_prefix(msg, syslog, NULL); len += text_len; - if (next || newline) - len++; + len++; } - prefix = true; text = next; } while (text); @@ -1288,7 +1253,6 @@ static int syslog_print(char __user *buf, int size) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (syslog_seq == log_next_seq) { @@ -1298,13 +1262,11 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, - LOG_LINE_MAX + PREFIX_MAX); + n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; - syslog_prev = msg->flags; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1347,7 +1309,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; - enum log_flags prev; /* * Find first record that fits, including all following records, @@ -1355,12 +1316,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) */ seq = clear_seq; idx = clear_idx; - prev = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len += msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; + len += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1368,12 +1327,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; - prev = 0; while (len > size && seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; + len -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1386,7 +1343,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct printk_log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, + textlen = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; @@ -1394,7 +1351,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) } idx = log_next(idx); seq++; - prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -1407,7 +1363,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; - prev = 0; } } } @@ -1508,7 +1463,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1521,16 +1475,14 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { u64 seq = syslog_seq; u32 idx = syslog_idx; - enum log_flags prev = syslog_prev; error = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - error += msg_print_text(msg, prev, true, NULL, 0); + error += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } error -= syslog_partial; } @@ -1631,46 +1583,25 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - size_t cons; /* bytes written to console */ struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ - bool flushed:1; /* buffer sealed and committed */ } cont; static void cont_flush(void) { - if (cont.flushed) - return; if (cont.len == 0) return; - if (cont.cons) { - /* - * If a fragment of this line was directly flushed to the - * console; wait for the console to pick up the rest of the - * line. LOG_NOCONS suppresses a duplicated output. - */ - log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flushed = true; - } else { - /* - * If no fragment of this line ever reached the console, - * just submit it to the store and free the buffer. - */ - log_store(cont.facility, cont.level, cont.flags, 0, - NULL, 0, cont.buf, cont.len); - cont.len = 0; - } + + log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + cont.len = 0; } static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { - if (cont.len && cont.flushed) - return false; - /* * If ext consoles are present, flush and skip in-kernel * continuation. See nr_ext_console_drivers definition. Also, if @@ -1687,8 +1618,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * cont.owner = current; cont.ts_nsec = local_clock(); cont.flags = flags; - cont.cons = 0; - cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); @@ -1707,34 +1636,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * return true; } -static size_t cont_print_text(char *text, size_t size) -{ - size_t textlen = 0; - size_t len; - - if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { - textlen += print_time(cont.ts_nsec, text); - size -= textlen; - } - - len = cont.len - cont.cons; - if (len > 0) { - if (len+1 > size) - len = size-1; - memcpy(text + textlen, cont.buf + cont.cons, len); - textlen += len; - cont.cons = cont.len; - } - - if (cont.flushed) { - if (cont.flags & LOG_NEWLINE) - text[textlen++] = '\n'; - /* got everything, release buffer */ - cont.len = 0; - } - return textlen; -} - static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { /* @@ -1926,7 +1827,8 @@ int vprintk_default(const char *fmt, va_list args) int r; #ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } @@ -1980,33 +1882,24 @@ static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; static u32 console_idx; -static enum log_flags syslog_prev; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; -static enum log_flags console_prev; -static struct cont { - size_t len; - size_t cons; - u8 level; - bool flushed:1; -} cont; static char *log_text(const struct printk_log *msg) { return NULL; } static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) { return 0; } + struct printk_log *msg, + u64 seq) { return 0; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } static void call_console_drivers(int level, const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } -static size_t cont_print_text(char *text, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } /* Still needs to be defined for users */ @@ -2270,42 +2163,6 @@ static inline int can_use_console(void) return cpu_online(raw_smp_processor_id()) || have_callable_console(); } -static void console_cont_flush(char *text, size_t size) -{ - unsigned long flags; - size_t len; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - - if (!cont.len) - goto out; - - if (suppress_message_printing(cont.level)) { - cont.cons = cont.len; - if (cont.flushed) - cont.len = 0; - goto out; - } - - /* - * We still queue earlier records, likely because the console was - * busy. The earlier ones need to be printed before this one, we - * did not flush any fragment so far, so just let it queue up. - */ - if (console_seq < log_next_seq && !cont.cons) - goto out; - - len = cont_print_text(text, size); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, NULL, 0, text, len); - start_critical_timings(); - local_irq_restore(flags); - return; -out: - raw_spin_unlock_irqrestore(&logbuf_lock, flags); -} - /** * console_unlock - unlock the console system * @@ -2359,9 +2216,6 @@ again: return; } - /* flush buffered message fragment immediately to console */ - console_cont_flush(text, sizeof(text)); - for (;;) { struct printk_log *msg; size_t ext_len = 0; @@ -2381,7 +2235,6 @@ again: /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; - console_prev = 0; } else { len = 0; } @@ -2391,8 +2244,7 @@ skip: msg = log_from_idx(console_idx); level = msg->level; - if ((msg->flags & LOG_NOCONS) || - suppress_message_printing(level)) { + if (suppress_message_printing(level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and @@ -2400,22 +2252,14 @@ skip: */ console_idx = log_next(console_idx); console_seq++; - /* - * We will get here again when we register a new - * CON_PRINTBUFFER console. Clear the flag so we - * will properly dump everything later. - */ - msg->flags &= ~LOG_NOCONS; - console_prev = msg->flags; goto skip; } - len += msg_print_text(msg, console_prev, false, - text + len, sizeof(text) - len); + len += msg_print_text(msg, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq, console_prev); + msg, console_seq); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, log_dict(msg), msg->dict_len, @@ -2423,7 +2267,6 @@ skip: } console_idx = log_next(console_idx); console_seq++; - console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2718,7 +2561,6 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; - console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -3074,7 +2916,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, goto out; msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, 0, syslog, line, size); + l = msg_print_text(msg, syslog, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -3143,7 +2985,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 idx; u64 next_seq; u32 next_idx; - enum log_flags prev; size_t l = 0; bool ret = false; @@ -3166,27 +3007,23 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, true, NULL, 0); + l += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (l > size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l -= msg_print_text(msg, prev, true, NULL, 0); + l -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* last message in next interation */ @@ -3197,10 +3034,9 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, syslog, buf + l, size - l); + l += msg_print_text(msg, syslog, buf + l, size - l); idx = log_next(idx); seq++; - prev = msg->flags; } dumper->next_seq = next_seq; diff --git a/kernel/profile.c b/kernel/profile.c index 2dbccf2d806c..f67ce0aa6bc4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -408,7 +408,7 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e6474f7272ec..49ba7c1ade9d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -27,6 +27,35 @@ #include <linux/cn_proc.h> #include <linux/compat.h> +/* + * Access another process' address space via ptrace. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + if (!tsk->ptrace || + (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return 0; + } + + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + mmput(mm); + + return ret; +} + /* * ptrace a task: make the debugger its new parent and @@ -39,6 +68,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) BUG_ON(!list_empty(&child->ptrace_entry)); list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; + rcu_read_lock(); + child->ptracer_cred = get_cred(__task_cred(new_parent)); + rcu_read_unlock(); } /** @@ -71,12 +103,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) */ void __ptrace_unlink(struct task_struct *child) { + const struct cred *old_cred; BUG_ON(!child->ptrace); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + old_cred = child->ptracer_cred; + child->ptracer_cred = NULL; + put_cred(old_cred); spin_lock(&child->sighand->siglock); child->ptrace = 0; @@ -220,7 +256,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - int dumpable = 0; + struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -271,16 +307,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); - rcu_read_lock(); - if (dumpable != SUID_DUMP_USER && - !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); + mm = task->mm; + if (mm && + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptrace_has_cap(mm->user_ns, mode))) + return -EPERM; return security_ptrace_access_check(task, mode); } @@ -344,10 +375,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - rcu_read_lock(); - if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); @@ -537,7 +564,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); + retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE); + if (!retval) { if (copied) break; @@ -564,7 +592,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, + retval = ptrace_access_vm(tsk, dst, buf, this_len, FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) @@ -1128,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); + copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1139,7 +1167,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), + copied = ptrace_access_vm(tsk, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1157,7 +1185,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), + ret = ptrace_access_vm(child, addr, &word, sizeof(word), FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; @@ -1167,7 +1195,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), + ret = ptrace_access_vm(child, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 80adef7d4c3d..0d6ff3e471be 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void); #define TPS(x) tracepoint_string(x) void rcu_early_boot_tests(void); +void rcu_test_sync_prims(void); /* * This function really isn't for public consumption, but RCU is special in diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1898559e6b60..b23a4d076f3d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -185,9 +185,6 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused * benefits of doing might_sleep() to reduce latency.) * * Cool, huh? (Due to Josh Triplett.) - * - * But we want to make this a static inline later. The cond_resched() - * currently makes this problematic. */ void synchronize_sched(void) { @@ -195,7 +192,6 @@ void synchronize_sched(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_sched() in RCU read-side critical section"); - cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 196f0302e2f4..c64b827ecbca 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); /* * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. + * invoked, we start taking RCU lockdep issues seriously. Note that unlike + * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE + * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. + * The reason for this is that Tiny RCU does not need kthreads, so does + * not have to care about the fact that the scheduler is half-initialized + * at a certain phase of the boot process. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 96c52e43f7ca..cb4e2056ccf3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ int sysctl_panic_on_rcu_stall __read_mostly; /* - * The rcu_scheduler_active variable transitions from zero to one just - * before the first task is spawned. So when this variable is zero, RCU - * can assume that there is but one task, allowing RCU to (for example) + * The rcu_scheduler_active variable is initialized to the value + * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the + * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE, + * RCU can assume that there is but one task, allowing RCU to (for example) * optimize synchronize_rcu() to a simple barrier(). When this variable - * is one, RCU must actually do all the hard work required to detect real - * grace periods. This variable is also used to suppress boot-time false - * positives from lockdep-RCU error checking. + * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required + * to detect real grace periods. This variable is also used to suppress + * boot-time false positives from lockdep-RCU error checking. Finally, it + * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU + * is fully initialized, including all of its kthreads having been spawned. */ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); @@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void) early_initcall(rcu_spawn_gp_kthread); /* - * This function is invoked towards the end of the scheduler's initialization - * process. Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. This function also enables RCU lockdep checking. + * This function is invoked towards the end of the scheduler's + * initialization process. Before this is called, the idle task might + * contain synchronous grace-period primitives (during which time, this idle + * task is booting the system, and such primitives are no-ops). After this + * function is called, any synchronous grace-period primitives are run as + * expedited, with the requesting task driving the grace period forward. + * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * runtime RCU functionality. */ void rcu_scheduler_starting(void) { WARN_ON(num_online_cpus() != 1); WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_INIT; + rcu_test_sync_prims(); } /* diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d3053e99fdb6..e59e1849b89a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -532,18 +532,28 @@ struct rcu_exp_work { }; /* + * Common code to drive an expedited grace period forward, used by + * workqueues and mid-boot-time tasks. + */ +static void rcu_exp_sel_wait_wake(struct rcu_state *rsp, + smp_call_func_t func, unsigned long s) +{ + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, func); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} + +/* * Work-queue handler to drive an expedited grace period forward. */ static void wait_rcu_exp_gp(struct work_struct *wp) { struct rcu_exp_work *rewp; - /* Initialize the rcu_node tree in preparation for the wait. */ rewp = container_of(wp, struct rcu_exp_work, rew_work); - sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); + rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s); } /* @@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - /* Marshall arguments and schedule the expedited grace period. */ - rew.rew_func = func; - rew.rew_rsp = rsp; - rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + /* Ensure that load happens before action based on it. */ + if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + /* Direct call during scheduler init and early_initcalls(). */ + rcu_exp_sel_wait_wake(rsp, func, s); + } else { + /* Marshall arguments & schedule the expedited grace period. */ + rew.rew_func = func; + rew.rew_rsp = rsp; + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + schedule_work(&rew.rew_work); + } /* Wait for expedited grace period to complete. */ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); @@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); @@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Switch to run-time mode once Tree RCU has fully initialized. + */ +static int __init rcu_exp_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 85c5a883c6e3..56583e764ebf 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -670,7 +670,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (!rcu_scheduler_active) + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f19271dce0a9..4f6db7e6a117 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * Should expedited grace-period primitives always fall back to their * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then - * rcu_normal wins. + * rcu_normal wins. (Except during the time period during boot from + * when the first task is spawned until the rcu_exp_runtime_mode() + * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) { - return READ_ONCE(rcu_normal); + return READ_ONCE(rcu_normal) && + rcu_scheduler_active != RCU_SCHEDULER_INIT; } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); @@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting = /* * Should normal grace-period primitives be expedited? Intended for * use within RCU. Note that this function takes the rcu_expedited - * sysfs/boot variable into account as well as the rcu_expedite_gp() - * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() - * returns false is a -really- bad idea. + * sysfs/boot variable and rcu_scheduler_active into account as well + * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp() + * until rcu_gp_is_expedited() returns false is a -really- bad idea. */ bool rcu_gp_is_expedited(void) { - return rcu_expedited || atomic_read(&rcu_expedited_nesting); + return rcu_expedited || atomic_read(&rcu_expedited_nesting) || + rcu_scheduler_active == RCU_SCHEDULER_INIT; } EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); @@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map); int notrace debug_lockdep_rcu_enabled(void) { - return rcu_scheduler_active && debug_locks && + return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); @@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(!rcu_scheduler_active, + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ @@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + #ifdef CONFIG_PROVE_RCU /* @@ -865,6 +886,7 @@ void rcu_early_boot_tests(void) early_boot_test_call_rcu_bh(); if (rcu_self_test_sched) early_boot_test_call_rcu_sched(); + rcu_test_sync_prims(); } static int rcu_verify_early_boot_tests(void) diff --git a/kernel/relay.c b/kernel/relay.c index da79a109dbeb..8f18d314a96a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan, { struct rchan_buf *buf; - if (!chan) + if (!chan || cpu >= NR_CPUS) return; buf = *per_cpu_ptr(chan->buf, cpu); - if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) + if (!buf || subbufs_consumed > chan->n_subbufs) return; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 966556ebdbb3..c56fb57f2991 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1456,7 +1456,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(queued)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&to, HRTIMER_MODE_REL); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index bff9c774987a..f7ce79a46050 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,8 +41,7 @@ * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter - * @len: the number of instructions in the program - * @insnsi: the BPF program instructions to evaluate + * @prog: the BPF program to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -168,8 +167,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) } /** - * seccomp_run_filters - evaluates all seccomp filters against @syscall - * @syscall: number of the current system call + * seccomp_run_filters - evaluates all seccomp filters against @sd + * @sd: optional seccomp data to be passed to filters * * Returns valid seccomp BPF response codes. */ diff --git a/kernel/signal.c b/kernel/signal.c index 29a410780aa9..3603d93a1968 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -39,7 +39,7 @@ #include <trace/events/signal.h> #include <asm/param.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> @@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task) * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { - sig->flags = SIGNAL_STOP_STOPPED; + signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; @@ -587,7 +587,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) struct hrtimer *tmr = &tsk->signal->real_timer; if (!hrtimer_is_queued(tmr) && - tsk->signal->it_real_incr.tv64 != 0) { + tsk->signal->it_real_incr != 0) { hrtimer_forward(tmr, tmr->base->get_time(), tsk->signal->it_real_incr); hrtimer_restart(tmr); @@ -843,7 +843,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ - signal->flags = why | SIGNAL_STOP_CONTINUED; + signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } @@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; + /* + * In case the signal mask hasn't changed, there is nothing we need + * to do. The current->blocked shouldn't be modified by other task. + */ + if (sigequalsets(&tsk->blocked, newset)) + return; + spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); @@ -2759,7 +2766,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) int do_sigtimedwait(const sigset_t *which, siginfo_t *info, const struct timespec *ts) { - ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX }; + ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; int sig, ret = 0; @@ -2779,7 +2786,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(tsk, &mask, info); - if (!sig && timeout.tv64) { + if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when diff --git a/kernel/sys.c b/kernel/sys.c index 9758892a2d09..842914ef7de4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -57,7 +57,7 @@ /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 635482e60ca3..8acef8576ce9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -150,6 +150,9 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(compat_sys_io_setup); +cond_syscall(compat_sys_io_submit); +cond_syscall(compat_sys_io_getevents); cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 39b3368f6de6..8dbaec0e4f7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,7 +67,7 @@ #include <linux/bpf.h> #include <linux/mount.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/processor.h> #ifdef CONFIG_X86 @@ -627,7 +627,7 @@ static struct ctl_table kern_table[] = { .data = &tracepoint_printk, .maxlen = sizeof(tracepoint_printk), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = tracepoint_printk_sysctl, }, #endif #ifdef CONFIG_KEXEC_CORE @@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void) #ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING "Unsafe core_pattern used with "\ - "suid_dumpable=2. Pipe handler or fully qualified "\ - "core dump path required.\n"); + printk(KERN_WARNING +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); } #endif } diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6eb99c17dbd8..ece4b177052b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen) "warning: process `%s' used the deprecated sysctl " "system call with ", current->comm); for (i = 0; i < nlen; i++) - printk("%d.", name[i]); - printk("\n"); + printk(KERN_CONT "%d.", name[i]); + printk(KERN_CONT "\n"); } return; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9b08ca391aed..e6dc9a538efa 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -234,7 +234,7 @@ static int alarmtimer_suspend(struct device *dev) min = freezer_delta; expires = freezer_expires; type = freezer_alarmtype; - freezer_delta = ktime_set(0, 0); + freezer_delta = 0; spin_unlock_irqrestore(&freezer_delta_lock, flags); rtc = alarmtimer_get_rtcdev(); @@ -254,13 +254,13 @@ static int alarmtimer_suspend(struct device *dev) if (!next) continue; delta = ktime_sub(next->expires, base->gettime()); - if (!min.tv64 || (delta.tv64 < min.tv64)) { + if (!min || (delta < min)) { expires = next->expires; min = delta; type = i; } } - if (min.tv64 == 0) + if (min == 0) return 0; if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { @@ -277,7 +277,7 @@ static int alarmtimer_suspend(struct device *dev) now = ktime_add(now, min); /* Set alarm, if in the past reject suspend briefly to handle */ - ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + ret = rtc_timer_start(rtc, &rtctimer, now, 0); if (ret < 0) __pm_wakeup_event(ws, MSEC_PER_SEC); return ret; @@ -328,7 +328,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) delta = ktime_sub(absexp, base->gettime()); spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) { + if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; @@ -453,10 +453,10 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) delta = ktime_sub(now, alarm->node.expires); - if (delta.tv64 < 0) + if (delta < 0) return 0; - if (unlikely(delta.tv64 >= interval.tv64)) { + if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); overrun = ktime_divns(delta, incr); @@ -464,7 +464,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) alarm->node.expires = ktime_add_ns(alarm->node.expires, incr*overrun); - if (alarm->node.expires.tv64 > now.tv64) + if (alarm->node.expires > now) return overrun; /* * This (and the ktime_add() below) is the @@ -516,12 +516,13 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (posix_timer_event(ptr, 0) != 0) + if (IS_ENABLED(CONFIG_POSIX_TIMERS) && + posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; } /* Re-add periodic timers */ - if (ptr->it.alarm.interval.tv64) { + if (ptr->it.alarm.interval) { ptr->it_overrun += alarm_forward(alarm, now, ptr->it.alarm.interval); result = ALARMTIMER_RESTART; @@ -729,7 +730,7 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type, rem = ktime_sub(exp, alarm_bases[type].gettime()); - if (rem.tv64 <= 0) + if (rem <= 0) return 0; rmt = ktime_to_timespec(rem); @@ -754,7 +755,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) struct alarm alarm; int ret = 0; - exp.tv64 = restart->nanosleep.expires; + exp = restart->nanosleep.expires; alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); if (alarmtimer_do_nsleep(&alarm, exp)) @@ -834,7 +835,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; - restart->nanosleep.expires = exp.tv64; + restart->nanosleep.expires = exp; restart->nanosleep.rmtp = rmtp; ret = -ERESTART_RESTARTBLOCK; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 2c5bc77c0bb0..97ac0951f164 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -179,7 +179,7 @@ void clockevents_switch_state(struct clock_event_device *dev, void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); - dev->next_event.tv64 = KTIME_MAX; + dev->next_event = KTIME_MAX; } /** @@ -213,7 +213,7 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { printk_deferred(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); - dev->next_event.tv64 = KTIME_MAX; + dev->next_event = KTIME_MAX; return -ETIME; } @@ -310,7 +310,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, int64_t delta; int rc; - if (unlikely(expires.tv64 < 0)) { + if (unlikely(expires < 0)) { WARN_ON_ONCE(1); return -ETIME; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 150242ccfcd2..665985b0a89a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -170,7 +170,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow, cslast, wdlast, delta; + u64 csnow, wdnow, cslast, wdlast, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 08be5c99d26b..c6ecedd3b839 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -50,7 +50,7 @@ #include <linux/timer.h> #include <linux/freezer.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <trace/events/timer.h> @@ -171,7 +171,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) return 0; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires.tv64 <= new_base->cpu_base->expires_next.tv64; + return expires <= new_base->cpu_base->expires_next; #else return 0; #endif @@ -313,7 +313,7 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ - if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; @@ -465,8 +465,8 @@ static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; unsigned int active = cpu_base->active_bases; + ktime_t expires, expires_next = KTIME_MAX; hrtimer_update_next_timer(cpu_base, NULL); for (; active; base++, active >>= 1) { @@ -479,7 +479,7 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires.tv64 < expires_next.tv64) { + if (expires < expires_next) { expires_next = expires; hrtimer_update_next_timer(cpu_base, timer); } @@ -489,8 +489,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ - if (expires_next.tv64 < 0) - expires_next.tv64 = 0; + if (expires_next < 0) + expires_next = 0; return expires_next; } #endif @@ -561,10 +561,10 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) expires_next = __hrtimer_get_next_event(cpu_base); - if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + if (skip_equal && expires_next == cpu_base->expires_next) return; - cpu_base->expires_next.tv64 = expires_next.tv64; + cpu_base->expires_next = expires_next; /* * If a hang was detected in the last timer interrupt then we @@ -622,10 +622,10 @@ static void hrtimer_reprogram(struct hrtimer *timer, * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ - if (expires.tv64 < 0) - expires.tv64 = 0; + if (expires < 0) + expires = 0; - if (expires.tv64 >= cpu_base->expires_next.tv64) + if (expires >= cpu_base->expires_next) return; /* Update the pointer to the next expiring timer */ @@ -653,7 +653,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, */ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { - base->expires_next.tv64 = KTIME_MAX; + base->expires_next = KTIME_MAX; base->hres_active = 0; } @@ -827,21 +827,21 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) delta = ktime_sub(now, hrtimer_get_expires(timer)); - if (delta.tv64 < 0) + if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; - if (interval.tv64 < hrtimer_resolution) - interval.tv64 = hrtimer_resolution; + if (interval < hrtimer_resolution) + interval = hrtimer_resolution; - if (unlikely(delta.tv64 >= interval.tv64)) { + if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); - if (hrtimer_get_expires_tv64(timer) > now.tv64) + if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the @@ -955,7 +955,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); + tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } @@ -1104,7 +1104,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - expires = __hrtimer_get_next_event(cpu_base).tv64; + expires = __hrtimer_get_next_event(cpu_base); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1296,7 +1296,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) + if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow); @@ -1318,7 +1318,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; + dev->next_event = KTIME_MAX; raw_spin_lock(&cpu_base->lock); entry_time = now = hrtimer_update_base(cpu_base); @@ -1331,7 +1331,7 @@ retry: * timers which run their callback and need to be requeued on * this CPU. */ - cpu_base->expires_next.tv64 = KTIME_MAX; + cpu_base->expires_next = KTIME_MAX; __hrtimer_run_queues(cpu_base, now); @@ -1379,13 +1379,13 @@ retry: cpu_base->hang_detected = 1; raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); - if ((unsigned int)delta.tv64 > cpu_base->max_hang_time) - cpu_base->max_hang_time = (unsigned int) delta.tv64; + if ((unsigned int)delta > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ - if (delta.tv64 > 100 * NSEC_PER_MSEC) + if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); @@ -1495,7 +1495,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) ktime_t rem; rem = hrtimer_expires_remaining(timer); - if (rem.tv64 <= 0) + if (rem <= 0) return 0; rmt = ktime_to_timespec(rem); @@ -1693,7 +1693,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ - if (expires && !expires->tv64) { + if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 2b9f45bc955d..8c89143f9ebf 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -14,7 +14,7 @@ #include <linux/hrtimer.h> #include <trace/events/timer.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /** * itimer_get_remtime - get remaining time for the timer @@ -34,10 +34,10 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) * then we return 0 - which is correct. */ if (hrtimer_active(timer)) { - if (rem.tv64 <= 0) - rem.tv64 = NSEC_PER_USEC; + if (rem <= 0) + rem = NSEC_PER_USEC; } else - rem.tv64 = 0; + rem = 0; return ktime_to_timeval(rem); } @@ -216,12 +216,12 @@ again: goto again; } expires = timeval_to_ktime(value->it_value); - if (expires.tv64 != 0) { + if (expires != 0) { tsk->signal->it_real_incr = timeval_to_ktime(value->it_interval); hrtimer_start(timer, expires, HRTIMER_MODE_REL); } else - tsk->signal->it_real_incr.tv64 = 0; + tsk->signal->it_real_incr = 0; trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 555e21f7b966..a4a0e478e44d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -59,9 +59,9 @@ #define JIFFIES_SHIFT 8 #endif -static cycle_t jiffies_read(struct clocksource *cs) +static u64 jiffies_read(struct clocksource *cs) { - return (cycle_t) jiffies; + return (u64) jiffies; } static struct clocksource clocksource_jiffies = { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 6df8927c58a5..edf19cc53140 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -381,7 +381,7 @@ ktime_t ntp_get_next_leap(void) if ((time_state == TIME_INS) && (time_status & STA_INS)) return ktime_set(ntp_next_leap_sec, 0); - ret.tv64 = KTIME_MAX; + ret = KTIME_MAX; return ret; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f246763c9947..e9e8c10f0d9a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -6,7 +6,7 @@ #include <linux/posix-timers.h> #include <linux/errno.h> #include <linux/math64.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/kernel_stat.h> #include <trace/events/timer.h> #include <linux/tick.h> diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f2826c35e918..1e6623d76750 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -36,7 +36,7 @@ #include <linux/time.h> #include <linux/mutex.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> @@ -359,7 +359,7 @@ static void schedule_next_timer(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - if (timr->it.real.interval.tv64 == 0) + if (timr->it.real.interval == 0) return; timr->it_overrun += (unsigned int) hrtimer_forward(timer, @@ -449,7 +449,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); - if (timr->it.real.interval.tv64 != 0) + if (timr->it.real.interval != 0) si_private = ++timr->it_requeue_pending; if (posix_timer_event(timr, si_private)) { @@ -458,7 +458,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * we will not get a call back to restart it AND * it should be restarted. */ - if (timr->it.real.interval.tv64 != 0) { + if (timr->it.real.interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); /* @@ -485,9 +485,9 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) */ #ifdef CONFIG_HIGH_RES_TIMERS { - ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); + ktime_t kj = NSEC_PER_SEC / HZ; - if (timr->it.real.interval.tv64 < kj.tv64) + if (timr->it.real.interval < kj) now = ktime_add(now, kj); } #endif @@ -743,7 +743,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) iv = timr->it.real.interval; /* interval timer ? */ - if (iv.tv64) + if (iv) cur_setting->it_interval = ktime_to_timespec(iv); else if (!hrtimer_active(timer) && (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) @@ -756,13 +756,13 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) * timer move the expiry time forward by intervals, so * expiry is > now. */ - if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ - if (remaining.tv64 <= 0) { + if (remaining <= 0) { /* * A single shot SIGEV_NONE timer must return 0, when * it is expired ! @@ -839,7 +839,7 @@ common_timer_set(struct k_itimer *timr, int flags, common_timer_get(timr, old_setting); /* disable the timer */ - timr->it.real.interval.tv64 = 0; + timr->it.real.interval = 0; /* * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. @@ -924,7 +924,7 @@ retry: static int common_timer_del(struct k_itimer *timer) { - timer->it.real.interval.tv64 = 0; + timer->it.real.interval = 0; if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) return TIMER_RETRY; diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 690b797f522e..a7bb8f33ae07 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -97,7 +97,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) - if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) + if (ce_broadcast_hrtimer.next_event != KTIME_MAX) return HRTIMER_RESTART; return HRTIMER_NORESTART; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f6aae7977824..3109204c87cc 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -604,14 +604,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) bool bc_local; raw_spin_lock(&tick_broadcast_lock); - dev->next_event.tv64 = KTIME_MAX; - next_event.tv64 = KTIME_MAX; + dev->next_event = KTIME_MAX; + next_event = KTIME_MAX; cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) { + if (td->evtdev->next_event <= now) { cpumask_set_cpu(cpu, tmpmask); /* * Mark the remote cpu in the pending mask, so @@ -619,8 +619,8 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) * timer in tick_broadcast_oneshot_control(). */ cpumask_set_cpu(cpu, tick_broadcast_pending_mask); - } else if (td->evtdev->next_event.tv64 < next_event.tv64) { - next_event.tv64 = td->evtdev->next_event.tv64; + } else if (td->evtdev->next_event < next_event) { + next_event = td->evtdev->next_event; next_cpu = cpu; } } @@ -657,7 +657,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) * - There are pending events on sleeping CPUs which were not * in the event mask */ - if (next_event.tv64 != KTIME_MAX) + if (next_event != KTIME_MAX) tick_broadcast_set_event(dev, next_cpu, next_event); raw_spin_unlock(&tick_broadcast_lock); @@ -672,7 +672,7 @@ static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) { if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) return 0; - if (bc->next_event.tv64 == KTIME_MAX) + if (bc->next_event == KTIME_MAX) return 0; return bc->bound_on == cpu ? -EBUSY : 0; } @@ -688,7 +688,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { if (broadcast_needs_cpu(bc, smp_processor_id())) return; - if (dev->next_event.tv64 < bc->next_event.tv64) + if (dev->next_event < bc->next_event) return; } clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); @@ -754,7 +754,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) */ if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { ret = -EBUSY; - } else if (dev->next_event.tv64 < bc->next_event.tv64) { + } else if (dev->next_event < bc->next_event) { tick_broadcast_set_event(bc, cpu, dev->next_event); /* * In case of hrtimer broadcasts the @@ -789,7 +789,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) /* * Bail out if there is no next event. */ - if (dev->next_event.tv64 == KTIME_MAX) + if (dev->next_event == KTIME_MAX) goto out; /* * If the pending bit is not set, then we are @@ -824,7 +824,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * nohz fixups. */ now = ktime_get(); - if (dev->next_event.tv64 <= now.tv64) { + if (dev->next_event <= now) { cpumask_set_cpu(cpu, tick_broadcast_force_mask); goto out; } @@ -871,6 +871,9 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { int cpu = smp_processor_id(); + if (!bc) + return; + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = clockevent_state_periodic(bc); @@ -894,7 +897,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_next_period); tick_broadcast_set_event(bc, cpu, tick_next_period); } else - bc->next_event.tv64 = KTIME_MAX; + bc->next_event = KTIME_MAX; } else { /* * The first cpu which switches to oneshot mode sets diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 4fcd99e12aa0..49edc1c4f3e6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -178,8 +178,8 @@ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, const struct cpumask *cpumask) { - ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; + ktime_t next_event = 0; /* * First device setup ? @@ -195,7 +195,7 @@ static void tick_setup_device(struct tick_device *td, else tick_do_timer_cpu = TICK_DO_TIMER_NONE; tick_next_period = ktime_get(); - tick_period = ktime_set(0, NSEC_PER_SEC / HZ); + tick_period = NSEC_PER_SEC / HZ; } /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index b51344652330..6b009c207671 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -28,7 +28,7 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - if (unlikely(expires.tv64 == KTIME_MAX)) { + if (unlikely(expires == KTIME_MAX)) { /* * We don't need the clock event device any more, stop it. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 71496a20e670..74e0388cc88d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -58,21 +58,21 @@ static void tick_do_update_jiffies64(ktime_t now) * Do a quick check without holding jiffies_lock: */ delta = ktime_sub(now, last_jiffies_update); - if (delta.tv64 < tick_period.tv64) + if (delta < tick_period) return; /* Reevaluate with jiffies_lock held */ write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); - if (delta.tv64 >= tick_period.tv64) { + if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); last_jiffies_update = ktime_add(last_jiffies_update, tick_period); /* Slow path for long timeouts */ - if (unlikely(delta.tv64 >= tick_period.tv64)) { + if (unlikely(delta >= tick_period)) { s64 incr = ktime_to_ns(tick_period); ticks = ktime_divns(delta, incr); @@ -101,7 +101,7 @@ static ktime_t tick_init_jiffy_update(void) write_seqlock(&jiffies_lock); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update.tv64 == 0) + if (last_jiffies_update == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; write_sequnlock(&jiffies_lock); @@ -669,7 +669,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&jiffies_lock); - basemono = last_jiffies_update.tv64; + basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; @@ -697,7 +697,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { - tick.tv64 = 0; + tick = 0; /* * Tell the timer code that the base is not idle, i.e. undo @@ -764,10 +764,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, expires = KTIME_MAX; expires = min_t(u64, expires, next_tick); - tick.tv64 = expires; + tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == dev->next_event.tv64)) + if (ts->tick_stopped && (expires == ts->next_tick)) goto out; /* @@ -787,6 +787,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } + ts->next_tick = tick; + /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. @@ -802,7 +804,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, else tick_program_event(tick, 1); out: - /* Update the estimated sleep length */ + /* + * Update the estimated sleep length until the next timer + * (not only the tick). + */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } @@ -864,7 +869,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { - ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; + ts->sleep_length = NSEC_PER_SEC / HZ; return false; } @@ -914,7 +919,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); - if (expires.tv64 > 0LL) { + if (expires > 0LL) { ts->idle_sleeps++; ts->idle_expires = expires; } @@ -1051,7 +1056,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - dev->next_event.tv64 = KTIME_MAX; + dev->next_event = KTIME_MAX; tick_sched_do_timer(now); tick_sched_handle(ts, regs); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,6 +27,7 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -44,6 +45,7 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; + ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; diff --git a/kernel/time/time.c b/kernel/time/time.c index bd62fb8e8e77..a3a9a8a029dc 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -38,7 +38,7 @@ #include <linux/math64.h> #include <linux/ptrace.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #include <generated/timeconst.h> diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 4687b3104bae..8afd78932bdf 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(timecounter_init); */ static u64 timecounter_read_delta(struct timecounter *tc) { - cycle_t cycle_now, cycle_delta; + u64 cycle_now, cycle_delta; u64 ns_offset; /* read cycle counter: */ @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(timecounter_read); * time previous to the time stored in the cycle counter. */ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, - cycle_t cycles, u64 mask, u64 frac) + u64 cycles, u64 mask, u64 frac) { u64 ns = (u64) cycles; @@ -90,7 +90,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, } u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) + u64 cycle_tstamp) { u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; u64 nsec = tc->nsec, frac = tc->frac; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index da233cdf89b0..db087d7e106d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -104,7 +104,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) */ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); - WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64); + WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec64_to_ktime(tmp); @@ -119,10 +119,10 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ -static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +static void timekeeping_check_update(struct timekeeper *tk, u64 offset) { - cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; + u64 max_cycles = tk->tkr_mono.clock->max_cycles; const char *name = tk->tkr_mono.clock->name; if (offset > max_cycles) { @@ -158,10 +158,10 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) } } -static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) { struct timekeeper *tk = &tk_core.timekeeper; - cycle_t now, last, mask, max, delta; + u64 now, last, mask, max, delta; unsigned int seq; /* @@ -199,12 +199,12 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) return delta; } #else -static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) { } -static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) { - cycle_t cycle_now, delta; + u64 cycle_now, delta; /* read clocksource */ cycle_now = tkr->read(tkr->clock); @@ -229,7 +229,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) */ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { - cycle_t interval; + u64 interval; u64 tmp, ntpinterval; struct clocksource *old_clock; @@ -254,7 +254,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (tmp == 0) tmp = 1; - interval = (cycle_t) tmp; + interval = (u64) tmp; tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ @@ -298,8 +298,7 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, - cycle_t delta) +static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta) { u64 nsec; @@ -312,16 +311,15 @@ static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, static inline u64 timekeeping_get_ns(struct tk_read_base *tkr) { - cycle_t delta; + u64 delta; delta = timekeeping_get_delta(tkr); return timekeeping_delta_to_ns(tkr, delta); } -static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, - cycle_t cycles) +static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, u64 cycles) { - cycle_t delta; + u64 delta; /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); @@ -454,9 +452,9 @@ u64 notrace ktime_get_boot_fast_ns(void) EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); /* Suspend-time cycles value for halted fast timekeeper. */ -static cycle_t cycles_at_suspend; +static u64 cycles_at_suspend; -static cycle_t dummy_clock_read(struct clocksource *cs) +static u64 dummy_clock_read(struct clocksource *cs) { return cycles_at_suspend; } @@ -573,7 +571,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); static inline void tk_update_leap_state(struct timekeeper *tk) { tk->next_leap_ktime = ntp_get_next_leap(); - if (tk->next_leap_ktime.tv64 != KTIME_MAX) + if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } @@ -650,7 +648,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) static void timekeeping_forward_now(struct timekeeper *tk) { struct clocksource *clock = tk->tkr_mono.clock; - cycle_t cycle_now, delta; + u64 cycle_now, delta; u64 nsec; cycle_now = tk->tkr_mono.read(clock); @@ -923,7 +921,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) ktime_t base_real; u64 nsec_raw; u64 nsec_real; - cycle_t now; + u64 now; WARN_ON_ONCE(timekeeping_suspended); @@ -982,8 +980,8 @@ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, - cycle_t partial_history_cycles, - cycle_t total_history_cycles, + u64 partial_history_cycles, + u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { @@ -1047,7 +1045,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, /* * cycle_between - true if test occurs chronologically between before and after */ -static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +static bool cycle_between(u64 before, u64 test, u64 after) { if (test > before && test < after) return true; @@ -1077,7 +1075,7 @@ int get_device_system_crosststamp(int (*get_time_fn) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; - cycle_t cycles, now, interval_start; + u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; @@ -1138,7 +1136,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * current interval */ if (do_interp) { - cycle_t partial_history_cycles, total_history_cycles; + u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* @@ -1644,7 +1642,7 @@ void timekeeping_resume(void) struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - cycle_t cycle_now; + u64 cycle_now; sleeptime_injected = false; read_persistent_clock64(&ts_new); @@ -2010,11 +2008,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * * Returns the unconsumed cycles. */ -static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, - u32 shift, - unsigned int *clock_set) +static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, + u32 shift, unsigned int *clock_set) { - cycle_t interval = tk->cycle_interval << shift; + u64 interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller than a shifted interval, do nothing */ @@ -2055,7 +2052,7 @@ void update_wall_time(void) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; - cycle_t offset; + u64 offset; int shift = 0, maxshift; unsigned int clock_set = 0; unsigned long flags; @@ -2253,7 +2250,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, } /* Handle leapsecond insertion adjustments */ - if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64)) + if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 5be76270ec4a..9a18f121f399 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -13,9 +13,9 @@ extern void tk_debug_account_sleep_time(struct timespec64 *t); #endif #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE -static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) { - cycle_t ret = (now - last) & mask; + u64 ret = (now - last) & mask; /* * Prevent time going backwards by checking the MSB of mask in @@ -24,7 +24,7 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) return ret & ~(mask >> 1) ? 0 : ret; } #else -static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) { return (now - last) & mask; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ea4fbf8477a9..ec33a6933eae 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -43,7 +43,7 @@ #include <linux/slab.h> #include <linux/compat.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ba7d8b288bb3..afe6cd1944fc 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -17,7 +17,7 @@ #include <linux/seq_file.h> #include <linux/kallsyms.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "tick-internal.h" diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 087204c733eb..afddded947df 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -43,7 +43,7 @@ #include <linux/seq_file.h> #include <linux/kallsyms.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * This is our basic unit of interest: a timer expiry event identified diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2a96b063d659..d5038005eb5d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -133,6 +134,7 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER + select GLOB help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 33dd57f53f88..eb230f06ba41 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2847,7 +2847,7 @@ static void ftrace_shutdown_sysctl(void) } } -static cycle_t ftrace_update_time; +static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; static inline int ops_traces_mod(struct ftrace_ops *ops) @@ -2894,7 +2894,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; - cycle_t start, stop; + u64 start, stop; unsigned long update_cnt = 0; unsigned long rec_flags = 0; int i; @@ -3511,6 +3511,10 @@ static int ftrace_match(char *str, struct ftrace_glob *g) memcmp(str + slen - g->len, g->search, g->len) == 0) matched = 1; break; + case MATCH_GLOB: + if (glob_match(g->search, str)) + matched = 1; + break; } return matched; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 89a2611a1635..a85739efcc30 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -245,7 +245,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ -static void * +static __always_inline void * rb_event_data(struct ring_buffer_event *event) { if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) @@ -1798,48 +1798,48 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static inline void * +static __always_inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { return bpage->data + index; } -static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) +static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; } -static inline struct ring_buffer_event * +static __always_inline struct ring_buffer_event * rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { return __rb_page_index(cpu_buffer->reader_page, cpu_buffer->reader_page->read); } -static inline struct ring_buffer_event * +static __always_inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_commit(struct buffer_page *bpage) +static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } /* Size is determined by what has been committed */ -static inline unsigned rb_page_size(struct buffer_page *bpage) +static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); } -static inline unsigned +static __always_inline unsigned rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) { return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned +static __always_inline unsigned rb_event_index(struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; @@ -2355,7 +2355,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } -static void +static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; @@ -2410,7 +2410,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) goto again; } -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2455,7 +2455,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static inline bool +static __always_inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -2469,7 +2469,7 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static void +static __always_inline void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -2702,7 +2702,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static struct ring_buffer_event * +static __always_inline struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 54d5270a5042..d7449783987a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -40,6 +40,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> +#include <linux/trace.h> #include <linux/sched/rt.h> #include "trace.h" @@ -68,6 +69,7 @@ bool __read_mostly tracing_selftest_disabled; /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; +static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -234,7 +236,7 @@ static int __init set_tracepoint_printk(char *str) } __setup("tp_printk", set_tracepoint_printk); -unsigned long long ns2usecs(cycle_t nsec) +unsigned long long ns2usecs(u64 nsec) { nsec += 500; do_div(nsec, 1000); @@ -571,7 +573,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, return read; } -static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) +static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; @@ -585,7 +587,7 @@ static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) return ts; } -cycle_t ftrace_now(int cpu) +u64 ftrace_now(int cpu) { return buffer_ftrace_now(&global_trace.trace_buffer, cpu); } @@ -738,6 +740,31 @@ static inline void ftrace_trace_stack(struct trace_array *tr, #endif +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} + static void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) @@ -767,6 +794,22 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); + +static __always_inline void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); +} + /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller @@ -794,8 +837,8 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, pc); if (!event) return 0; @@ -842,8 +885,8 @@ int __trace_bputs(unsigned long ip, const char *str) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, pc); if (!event) return 0; @@ -1907,35 +1950,19 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -static __always_inline void -trace_event_setup(struct ring_buffer_event *event, - int type, unsigned long flags, int pc) -{ - struct trace_entry *ent = ring_buffer_event_data(event); - - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; -} - struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) { - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) - trace_event_setup(event, type, flags, pc); - - return event; + return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -2049,21 +2076,6 @@ void trace_buffered_event_disable(void) preempt_enable(); } -void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) -{ - __this_cpu_write(trace_cmdline_save, true); - - /* If this is the temp buffer, we need to commit fully */ - if (this_cpu_read(trace_buffered_event) == event) { - /* Length is in event->array[0] */ - ring_buffer_write(buffer, event->array[0], &event->array[1]); - /* Release the temp buffer */ - this_cpu_dec(trace_buffered_event_cnt); - } else - ring_buffer_unlock_commit(buffer, event); -} - static struct ring_buffer *temp_buffer; struct ring_buffer_event * @@ -2090,8 +2102,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, this_cpu_dec(trace_buffered_event_cnt); } - entry = trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer @@ -2100,13 +2112,88 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; - entry = trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); } return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_MUTEX(tracepoint_printk_mutex); + +static void output_printk(struct trace_event_buffer *fbuffer) +{ + struct trace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + /* We should never get here if iter is NULL */ + if (WARN_ON_ONCE(!iter)) + return; + + event_call = fbuffer->trace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->trace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + +int tracepoint_printk_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int save_tracepoint_printk; + int ret; + + mutex_lock(&tracepoint_printk_mutex); + save_tracepoint_printk = tracepoint_printk; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + /* + * This will force exiting early, as tracepoint_printk + * is always zero when tracepoint_printk_iter is not allocated + */ + if (!tracepoint_print_iter) + tracepoint_printk = 0; + + if (save_tracepoint_printk == tracepoint_printk) + goto out; + + if (tracepoint_printk) + static_key_enable(&tracepoint_printk_key.key); + else + static_key_disable(&tracepoint_printk_key.key); + + out: + mutex_unlock(&tracepoint_printk_mutex); + + return ret; +} + +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) +{ + if (static_key_false(&tracepoint_printk_key.key)) + output_printk(fbuffer); + + event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2129,6 +2216,139 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } +/* + * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. + */ +void +trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + __buffer_unlock_commit(buffer, event); +} + +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event) +{ + struct trace_entry *entry; + unsigned int size = 0; + + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(entry, size); +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); + +static inline void ftrace_exports_enable(void) +{ + static_branch_enable(&ftrace_exports_enabled); +} + +static inline void ftrace_exports_disable(void) +{ + static_branch_disable(&ftrace_exports_enabled); +} + +void ftrace_exports(struct ring_buffer_event *event) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_notrace(ftrace_exports_list); + while (export) { + trace_process_export(export, event); + export = rcu_dereference_raw_notrace(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. + */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + if (*list == NULL) + ftrace_exports_enable(); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + if (*list == NULL) + ftrace_exports_disable(); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -2139,16 +2359,19 @@ trace_function(struct trace_array *tr, struct ring_buffer_event *event; struct ftrace_entry *entry; - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), - flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + flags, pc); if (!event) return; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) { + if (static_branch_unlikely(&ftrace_exports_enabled)) + ftrace_exports(event); __buffer_unlock_commit(buffer, event); + } } #ifdef CONFIG_STACKTRACE @@ -2216,8 +2439,8 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, size *= sizeof(unsigned long); - event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry) + size, flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, + sizeof(*entry) + size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -2318,8 +2541,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) __this_cpu_inc(user_stack_count); - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, - sizeof(*entry), flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, + sizeof(*entry), flags, pc); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); @@ -2489,8 +2712,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -2545,8 +2768,8 @@ __trace_array_vprintk(struct ring_buffer *buffer, local_save_flags(flags); size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -4055,6 +4278,7 @@ static const char readme_msg[] = " x86-tsc: TSC cycle counter\n" #endif "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" @@ -4066,7 +4290,7 @@ static const char readme_msg[] = "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" "\t\t\t functions\n" - "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t accepts: func_full_name or glob-matching-pattern\n" "\t modules: Can select a group via module\n" "\t Format: :mod:<module-name>\n" "\t example: echo :mod:ext3 > set_ftrace_filter\n" @@ -5519,21 +5743,18 @@ static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { - unsigned long addr = (unsigned long)ubuf; struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; - int nr_pages = 1; + const char faulted[] = "<faulted>"; ssize_t written; - int offset; int size; int len; - int ret; - int i; + +/* Used in tracing_mark_raw_write() as well */ +#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; @@ -5544,60 +5765,33 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - /* - * Userspace is injecting traces into the kernel trace buffer. - * We want to be as non intrusive as possible. - * To do so, we do not want to allocate any special buffers - * or take any locks, but instead write the userspace data - * straight into the ring buffer. - * - * First we need to pin the userspace buffer into memory, - * which, most likely it is, because it just referenced it. - * But there's no guarantee that it is. By using get_user_pages_fast() - * and kmap_atomic/kunmap_atomic() we can get access to the - * pages directly. We then write the data directly into the - * ring buffer. - */ BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - /* check if we cross pages */ - if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) - nr_pages = 2; - - offset = addr & (PAGE_SIZE - 1); - addr &= PAGE_MASK; - - ret = get_user_pages_fast(addr, nr_pages, 0, pages); - if (ret < nr_pages) { - while (--ret >= 0) - put_page(pages[ret]); - written = -EFAULT; - goto out; - } + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ - for (i = 0; i < nr_pages; i++) - map_page[i] = kmap_atomic(pages[i]); + /* If less than "<faulted>", then make sure we can still add that */ + if (cnt < FAULTED_SIZE) + size += FAULTED_SIZE - cnt; - local_save_flags(irq_flags); - size = sizeof(*entry) + cnt + 2; /* possible \n added */ buffer = tr->trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, preempt_count()); - if (!event) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (unlikely(!event)) /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } + return -EBADF; entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->buf, map_page[0] + offset, len); - memcpy(&entry->buf[len], map_page[1], cnt - len); + len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + if (len) { + memcpy(&entry->buf, faulted, FAULTED_SIZE); + cnt = FAULTED_SIZE; + written = -EFAULT; } else - memcpy(&entry->buf, map_page[0] + offset, cnt); + written = cnt; + len = cnt; if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; @@ -5607,16 +5801,73 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - written = cnt; + if (written > 0) + *fpos += written; - *fpos += written; + return written; +} + +/* Limit it for now to 3K (including tag) */ +#define RAW_DATA_MAX_SIZE (1024*3) + +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct raw_data_entry *entry; + const char faulted[] = "<faulted>"; + unsigned long irq_flags; + ssize_t written; + int size; + int len; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt; + if (cnt < FAULT_SIZE_ID) + size += FAULT_SIZE_ID - cnt; + + buffer = tr->trace_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, + irq_flags, preempt_count()); + if (!event) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; + + entry = ring_buffer_event_data(event); + + len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + if (len) { + entry->id = -1; + memcpy(&entry->buf, faulted, FAULTED_SIZE); + written = -EFAULT; + } else + written = cnt; + + __buffer_unlock_commit(buffer, event); + + if (written > 0) + *fpos += written; - out_unlock: - for (i = nr_pages - 1; i >= 0; i--) { - kunmap_atomic(map_page[i]); - put_page(pages[i]); - } - out: return written; } @@ -5946,6 +6197,13 @@ static const struct file_operations tracing_mark_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_mark_raw_fops = { + .open = tracing_open_generic_tr, + .write = tracing_mark_raw_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, @@ -7215,6 +7473,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + trace_create_file("trace_marker_raw", 0220, d_tracer, + tr, &tracing_mark_raw_fops); + trace_create_file("trace_clock", 0644, d_tracer, tr, &trace_clock_fops); @@ -7752,6 +8013,8 @@ void __init trace_init(void) kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (WARN_ON(!tracepoint_print_iter)) tracepoint_printk = 0; + else + static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); trace_event_init(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd24b1f9ac43..1ea51ab53edf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,6 +15,7 @@ #include <linux/trace_events.h> #include <linux/compiler.h> #include <linux/trace_seq.h> +#include <linux/glob.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -39,6 +40,7 @@ enum trace_type { TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, + TRACE_RAW_DATA, __TRACE_LAST_TYPE, }; @@ -157,7 +159,7 @@ struct trace_array_cpu { unsigned long policy; unsigned long rt_priority; unsigned long skipped_entries; - cycle_t preempt_timestamp; + u64 preempt_timestamp; pid_t pid; kuid_t uid; char comm[TASK_COMM_LEN]; @@ -175,7 +177,7 @@ struct trace_buffer { struct trace_array *tr; struct ring_buffer *buffer; struct trace_array_cpu __percpu *data; - cycle_t time_start; + u64 time_start; int cpu; }; @@ -330,6 +332,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -599,8 +602,8 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void __buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event); +void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer, + struct ring_buffer_event *event); int trace_empty(struct trace_iterator *iter); @@ -686,7 +689,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, } #endif /* CONFIG_STACKTRACE */ -extern cycle_t ftrace_now(int cpu); +extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); @@ -733,7 +736,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); -extern unsigned long long ns2usecs(cycle_t nsec); +extern unsigned long long ns2usecs(u64 nsec); extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int @@ -843,6 +846,17 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ + +extern unsigned int fgraph_max_depth; + +static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) +{ + /* trace it when it is-nested-in or is a function enabled. */ + return !(trace->depth || ftrace_graph_addr(trace->func)) || + (trace->depth < 0) || + (fgraph_max_depth && trace->depth >= fgraph_max_depth); +} + #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags) @@ -1257,6 +1271,7 @@ enum regex_type { MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY, + MATCH_GLOB, }; struct regex { diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 0f109c4130d3..e3b488825ae3 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -21,6 +21,8 @@ static u64 bm_stddev; static unsigned int bm_avg; static unsigned int bm_std; +static bool ok_to_run; + /* * This gets called in a loop recording the time it took to write * the tracepoint. What it writes is the time statistics of the last @@ -164,11 +166,21 @@ static int benchmark_event_kthread(void *arg) * When the benchmark tracepoint is enabled, it calls this * function and the thread that calls the tracepoint is created. */ -void trace_benchmark_reg(void) +int trace_benchmark_reg(void) { + if (!ok_to_run) { + pr_warning("trace benchmark cannot be started via kernel command line\n"); + return -EBUSY; + } + bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - WARN_ON(!bm_event_thread); + if (!bm_event_thread) { + pr_warning("trace benchmark failed to create kernel thread\n"); + return -ENOMEM; + } + + return 0; } /* @@ -182,6 +194,7 @@ void trace_benchmark_unreg(void) return; kthread_stop(bm_event_thread); + bm_event_thread = NULL; strcpy(bm_str, "START"); bm_total = 0; @@ -196,3 +209,12 @@ void trace_benchmark_unreg(void) bm_avg = 0; bm_stddev = 0; } + +static __init int ok_to_run_trace_benchmark(void) +{ + ok_to_run = true; + + return 0; +} + +early_initcall(ok_to_run_trace_benchmark); diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 3c1df1df4e29..ebdbfc2f2a64 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_benchmark_reg(void); +extern int trace_benchmark_reg(void); extern void trace_benchmark_unreg(void); #define BENCHMARK_EVENT_STRLEN 128 diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3a2a73716a5b..75489de546b6 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -81,7 +81,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->correct = val == expect; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); out: current->trace_recursion &= ~TRACE_BRANCH_BIT; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index d1cc37e78f99..eb7396b7e7c3 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -244,6 +244,21 @@ FTRACE_ENTRY(print, print_entry, FILTER_OTHER ); +FTRACE_ENTRY(raw_data, raw_data_entry, + + TRACE_RAW_DATA, + + F_STRUCT( + __field( unsigned int, id ) + __dynamic_array( char, buf ) + ), + + F_printk("id:%04x %08x", + __entry->id, (int)__entry->buf[0]), + + FILTER_OTHER +); + FTRACE_ENTRY(bputs, bputs_entry, TRACE_BPUTS, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 03c0a48c3ac4..93116549a284 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -283,46 +283,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); - -static void output_printk(struct trace_event_buffer *fbuffer) -{ - struct trace_event_call *event_call; - struct trace_event *event; - unsigned long flags; - struct trace_iterator *iter = tracepoint_print_iter; - - if (!iter) - return; - - event_call = fbuffer->trace_file->event_call; - if (!event_call || !event_call->event.funcs || - !event_call->event.funcs->trace) - return; - - event = &fbuffer->trace_file->event_call->event; - - spin_lock_irqsave(&tracepoint_iter_lock, flags); - trace_seq_init(&iter->seq); - iter->ent = fbuffer->entry; - event_call->event.funcs->trace(iter, 0, event); - trace_seq_putc(&iter->seq, 0); - printk("%s", iter->seq.buffer); - - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); -} - -void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) -{ - if (tracepoint_printk) - output_printk(fbuffer); - - event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, - fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc); -} -EXPORT_SYMBOL_GPL(trace_event_buffer_commit); - int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { @@ -742,6 +702,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, struct trace_event_call *call; const char *name; int ret = -EINVAL; + int eret = 0; list_for_each_entry(file, &tr->events, list) { @@ -765,9 +726,17 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, if (event && strcmp(event, name) != 0) continue; - ftrace_event_enable_disable(file, set); + ret = ftrace_event_enable_disable(file, set); - ret = 0; + /* + * Save the first error and return that. Some events + * may still have been enabled, but let the user + * know that something went wrong. + */ + if (ret && !eret) + eret = ret; + + ret = eret; } return ret; @@ -2843,20 +2812,32 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } + entry = trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + if (!entry) { + pr_warn("Could not create tracefs 'enable' entry\n"); + return -ENOMEM; + } + + /* There are not as crucial, just warn if they are not created */ + entry = tracefs_create_file("set_event_pid", 0644, parent, tr, &ftrace_set_event_pid_fops); + if (!entry) + pr_warn("Could not create tracefs 'set_event_pid' entry\n"); /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); + entry = trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_page' entry\n"); - trace_create_file("enable", 0644, d_events, - tr, &ftrace_tr_enable_fops); + entry = trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9daa9b3bc6d9..59a411ff60c7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -108,12 +108,12 @@ static char *err_text[] = { }; struct opstack_op { - int op; + enum filter_op_ids op; struct list_head list; }; struct postfix_elt { - int op; + enum filter_op_ids op; char *operand; struct list_head list; }; @@ -145,34 +145,50 @@ struct pred_stack { /* If not of not match is equal to not of not, then it is a match */ #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - case OP_BAND: \ - match = (*addr & val); \ - break; \ - default: \ - break; \ - } \ - \ + int match = (*addr < val); \ return !!match == !pred->not; \ -} +} \ +static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr <= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr > val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr >= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = !!(*addr & val); \ + return match == !pred->not; \ +} \ +static const filter_pred_fn_t pred_funcs_##type[] = { \ + filter_pred_LT_##type, \ + filter_pred_LE_##type, \ + filter_pred_GT_##type, \ + filter_pred_GE_##type, \ + filter_pred_BAND_##type, \ +}; + +#define PRED_FUNC_START OP_LT #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -344,6 +360,12 @@ static int regex_match_end(char *str, struct regex *r, int len) return 0; } +static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) +{ + if (glob_match(r->pattern, str)) + return 1; + return 0; +} /** * filter_parse_regex - parse a basic regex * @buff: the raw regex @@ -380,14 +402,20 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) if (!i) { *search = buff + 1; type = MATCH_END_ONLY; - } else { + } else if (i == len - 1) { if (type == MATCH_END_ONLY) type = MATCH_MIDDLE_ONLY; else type = MATCH_FRONT_ONLY; buff[i] = 0; break; + } else { /* pattern continues, use full glob */ + type = MATCH_GLOB; + break; } + } else if (strchr("[?\\", buff[i])) { + type = MATCH_GLOB; + break; } } @@ -420,6 +448,9 @@ static void filter_build_regex(struct filter_pred *pred) case MATCH_END_ONLY: r->match = regex_match_end; break; + case MATCH_GLOB: + r->match = regex_match_glob; + break; } pred->not ^= not; @@ -946,7 +977,7 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_legal_op(struct ftrace_event_field *field, int op) +static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op) { if (is_string_field(field) && (op != OP_EQ && op != OP_NE && op != OP_GLOB)) @@ -957,8 +988,8 @@ static bool is_legal_op(struct ftrace_event_field *field, int op) return true; } -static filter_pred_fn_t select_comparison_fn(int op, int field_size, - int field_is_signed) +static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { filter_pred_fn_t fn = NULL; @@ -967,33 +998,33 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, if (op == OP_EQ || op == OP_NE) fn = filter_pred_64; else if (field_is_signed) - fn = filter_pred_s64; + fn = pred_funcs_s64[op - PRED_FUNC_START]; else - fn = filter_pred_u64; + fn = pred_funcs_u64[op - PRED_FUNC_START]; break; case 4: if (op == OP_EQ || op == OP_NE) fn = filter_pred_32; else if (field_is_signed) - fn = filter_pred_s32; + fn = pred_funcs_s32[op - PRED_FUNC_START]; else - fn = filter_pred_u32; + fn = pred_funcs_u32[op - PRED_FUNC_START]; break; case 2: if (op == OP_EQ || op == OP_NE) fn = filter_pred_16; else if (field_is_signed) - fn = filter_pred_s16; + fn = pred_funcs_s16[op - PRED_FUNC_START]; else - fn = filter_pred_u16; + fn = pred_funcs_u16[op - PRED_FUNC_START]; break; case 1: if (op == OP_EQ || op == OP_NE) fn = filter_pred_8; else if (field_is_signed) - fn = filter_pred_s8; + fn = pred_funcs_s8[op - PRED_FUNC_START]; else - fn = filter_pred_u8; + fn = pred_funcs_u8[op - PRED_FUNC_START]; break; } @@ -1166,7 +1197,8 @@ static inline int append_operand_char(struct filter_parse_state *ps, char c) return 0; } -static int filter_opstack_push(struct filter_parse_state *ps, int op) +static int filter_opstack_push(struct filter_parse_state *ps, + enum filter_op_ids op) { struct opstack_op *opstack_op; @@ -1200,7 +1232,7 @@ static int filter_opstack_top(struct filter_parse_state *ps) static int filter_opstack_pop(struct filter_parse_state *ps) { struct opstack_op *opstack_op; - int op; + enum filter_op_ids op; if (filter_opstack_empty(ps)) return OP_NONE; @@ -1245,7 +1277,7 @@ static int postfix_append_operand(struct filter_parse_state *ps, char *operand) return 0; } -static int postfix_append_op(struct filter_parse_state *ps, int op) +static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) { struct postfix_elt *elt; @@ -1275,8 +1307,8 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + enum filter_op_ids op, top_op; int in_string = 0; - int op, top_op; char ch; while ((ch = infix_next(ps))) { @@ -1367,7 +1399,8 @@ parse_operand: static struct filter_pred *create_pred(struct filter_parse_state *ps, struct trace_event_call *call, - int op, char *operand1, char *operand2) + enum filter_op_ids op, + char *operand1, char *operand2) { struct ftrace_event_field *field; static struct filter_pred pred; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4e480e870474..d56123cdcc89 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,7 +65,7 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -static unsigned int max_depth; +unsigned int fgraph_max_depth; static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -358,7 +358,7 @@ int __trace_graph_entry(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->graph_ent = *trace; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } @@ -384,10 +384,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (!ftrace_trace_task(tr)) return 0; - /* trace it when it is-nested-in or is a function enabled. */ - if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || (trace->depth < 0) || - (max_depth && trace->depth >= max_depth)) + if (ftrace_graph_ignore_func(trace)) + return 0; + + if (ftrace_graph_ignore_irqs()) return 0; /* @@ -469,7 +469,7 @@ void __trace_graph_return(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ret = *trace; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -842,6 +842,10 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data = per_cpu_ptr(data->cpu_data, cpu); + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments @@ -850,7 +854,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->depth = call->depth - 1; /* No need to keep this function around for this depth */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = 0; } @@ -880,11 +885,16 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); cpu_data->depth = call->depth; /* Save this function pointer to see if the exit matches */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = call->func; } @@ -1114,7 +1124,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, */ cpu_data->depth = trace->depth - 1; - if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (trace->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(trace->depth < 0)) { if (cpu_data->enter_funcs[trace->depth] != trace->func) func_match = 0; cpu_data->enter_funcs[trace->depth] = 0; @@ -1489,7 +1500,7 @@ graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - max_depth = val; + fgraph_max_depth = val; *ppos += cnt; @@ -1503,7 +1514,7 @@ graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ int n; - n = sprintf(buf, "%d\n", max_depth); + n = sprintf(buf, "%d\n", fgraph_max_depth); return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b97286c48735..775569ec50d0 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -127,7 +127,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->nmi_count = sample->nmi_count; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* Macros to encapsulate the time capturing infrastructure */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03cdff84d026..7758bc0617cb 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -175,6 +175,18 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) int ret; int pc; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_dec(tr, &data, &flags)) return 0; @@ -286,7 +298,7 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static bool report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, u64 delta) { if (tracing_thresh) { if (delta < tracing_thresh) @@ -304,7 +316,7 @@ check_critical_timing(struct trace_array *tr, unsigned long parent_ip, int cpu) { - cycle_t T0, T1, delta; + u64 T0, T1, delta; unsigned long flags; int pc; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eb6c9f1d3a93..a133ecd741e4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -73,6 +73,17 @@ static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) return !!strchr(trace_kprobe_symbol(tk), ':'); } +static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) +{ + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); + + return nhit; +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -882,14 +893,10 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; - unsigned long nhit = 0; - int cpu; - - for_each_possible_cpu(cpu) - nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), nhit, + trace_event_name(&tk->tp.call), + trace_kprobe_nhit(tk), tk->rp.kp.nmissed); return 0; @@ -1354,18 +1361,18 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST - /* * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. + * from the kallsyms table. 'noinline' makes sure that there + * isn't an inlined version used by the test method below */ -static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) +static __used __init noinline int +kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) { return a1 + a2 + a3 + a4 + a5 + a6; } -static struct trace_event_file * +static struct __init trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { struct trace_event_file *file; @@ -1443,12 +1450,25 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); + /* + * Not expecting an error here, the check is only to prevent the + * optimizer from removing the call to target() as otherwise there + * are no side-effects and the call is never performed. + */ + if (ret != 21) + warn++; + /* Disable trace points before removing it */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); @@ -1462,6 +1482,11 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting 2nd test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe2 hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3fc20422c166..5d33a7352919 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1288,6 +1288,35 @@ static struct trace_event trace_print_event = { .funcs = &trace_print_funcs, }; +static enum print_line_t trace_raw_data(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct raw_data_entry *field; + int i; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "# %x buf:", field->id); + + for (i = 0; i < iter->ent_size - offsetof(struct raw_data_entry, buf); i++) + trace_seq_printf(&iter->seq, " %02x", + (unsigned char)field->buf[i]); + + trace_seq_putc(&iter->seq, '\n'); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions trace_raw_data_funcs = { + .trace = trace_raw_data, + .raw = trace_raw_data, +}; + +static struct trace_event trace_raw_data_event = { + .type = TRACE_RAW_DATA, + .funcs = &trace_raw_data_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1299,6 +1328,7 @@ static struct trace_event *events[] __initdata = { &trace_bprint_event, &trace_print_event, &trace_hwlat_event, + &trace_raw_data_event, NULL }; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9d4399b553a3..ddec53b67646 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -239,6 +239,18 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) unsigned long flags; int pc, ret = 0; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_preempt_disable(tr, &data, &pc)) return 0; @@ -346,7 +358,7 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static bool report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, u64 delta) { if (tracing_thresh) { if (delta < tracing_thresh) @@ -428,7 +440,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; - cycle_t T0, T1, delta; + u64 T0, T1, delta; unsigned long flags; long disabled; int cpu; @@ -790,6 +802,7 @@ static struct tracer wakeup_dl_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d0639d917899..1f9a31f934a4 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -194,9 +194,13 @@ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *func, int prio) { struct tracepoint_func *old, *tp_funcs; + int ret; - if (tp->regfunc && !static_key_enabled(&tp->key)) - tp->regfunc(); + if (tp->regfunc && !static_key_enabled(&tp->key)) { + ret = tp->regfunc(); + if (ret < 0) + return ret; + } tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); @@ -529,7 +533,7 @@ EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; -void syscall_regfunc(void) +int syscall_regfunc(void) { struct task_struct *p, *t; @@ -541,6 +545,8 @@ void syscall_regfunc(void) read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; + + return 0; } void syscall_unregfunc(void) diff --git a/kernel/uid16.c b/kernel/uid16.c index cc40793464e3..71645ae9303a 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -14,7 +14,7 @@ #include <linux/security.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,32 +24,14 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> -#include <linux/perf_event.h> #include <linux/kthread.h> -/* - * The run state of the lockup detectors is controlled by the content of the - * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. - * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. - */ -#define NMI_WATCHDOG_ENABLED_BIT 0 -#define SOFT_WATCHDOG_ENABLED_BIT 1 -#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) -#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) - static DEFINE_MUTEX(watchdog_proc_mutex); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif int __read_mostly nmi_watchdog_enabled; int __read_mostly soft_watchdog_enabled; @@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -#endif static unsigned long soft_lockup_nmi_warn; -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -#ifdef CONFIG_HARDLOCKUP_DETECTOR -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; -static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void) wq_watchdog_touch(-1); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. - */ - raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -#endif - void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static bool is_hardlockup(void) +bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); @@ -299,7 +219,6 @@ static bool is_hardlockup(void) __this_cpu_write(hrtimer_interrupts_saved, hrint); return false; } -#endif static int is_softlockup(unsigned long touch_ts) { @@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - struct pt_regs *regs = get_irq_regs(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_nmi_enable(unsigned int cpu); -static void watchdog_nmi_disable(unsigned int cpu); +/* + * These two functions are mostly architecture specific + * defining them as weak here. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} static int watchdog_enable_all_cpus(void); static void watchdog_disable_all_cpus(void); @@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long cpu0_err; - -static int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - -static void watchdog_nmi_disable(unsigned int cpu) -{ - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) { - perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); - } - if (cpu == 0) { - /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; - } -} - -#else -static int watchdog_nmi_enable(unsigned int cpu) { return 0; } -static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, .thread_should_run = watchdog_should_run, diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c @@ -0,0 +1,227 @@ +/* + * Detect hard lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks + * to those contributors as well. + */ + +#define pr_fmt(fmt) "NMI watchdog: " fmt + +#include <linux/nmi.h> +#include <linux/module.h> +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +void touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_nmi_touch, true); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + +int watchdog_nmi_enable(unsigned int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + + if (!IS_ERR(event)) { + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); + goto out_save; + } + + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + pr_warn("disabled (cpu%i): hardware events not enabled\n", + cpu); + else + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +void watchdog_nmi_disable(unsigned int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ + cpu0_err = 0; + } +} |