summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/audit.c536
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_fsnotify.c15
-rw-r--r--kernel/audit_tree.c23
-rw-r--r--kernel/audit_watch.c17
-rw-r--r--kernel/auditfilter.c5
-rw-r--r--kernel/auditsc.c12
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/core.c53
-rw-r--r--kernel/bpf/hashtab.c2
-rw-r--r--kernel/bpf/syscall.c46
-rw-r--r--kernel/bpf/verifier.c28
-rw-r--r--kernel/capability.c39
-rw-r--r--kernel/compat.c2
-rw-r--r--kernel/configs.c2
-rw-r--r--kernel/cpu.c258
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/debug/debug_core.c4
-rw-r--r--kernel/debug/kdb/kdb_io.c37
-rw-r--r--kernel/debug/kdb/kdb_main.c1
-rw-r--r--kernel/debug/kdb/kdb_private.h1
-rw-r--r--kernel/events/core.c177
-rw-r--r--kernel/events/uprobes.c6
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/futex.c4
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/irq/affinity.c6
-rw-r--r--kernel/jump_label.c7
-rw-r--r--kernel/kcov.c13
-rw-r--r--kernel/kexec_core.c5
-rw-r--r--kernel/kexec_file.c145
-rw-r--r--kernel/kexec_internal.h16
-rw-r--r--kernel/kmod.c2
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/locking/lockdep.c20
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/locking/qspinlock_stat.h12
-rw-r--r--kernel/memremap.c4
-rw-r--r--kernel/module.c71
-rw-r--r--kernel/padata.c4
-rw-r--r--kernel/panic.c55
-rw-r--r--kernel/pid_namespace.c10
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/printk/printk.c228
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/ptrace.c70
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/tiny.c4
-rw-r--r--kernel/rcu/tiny_plugin.h9
-rw-r--r--kernel/rcu/tree.c33
-rw-r--r--kernel/rcu/tree_exp.h52
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/update.c38
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/seccomp.c7
-rw-r--r--kernel/signal.c19
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/sysctl_binary.c4
-rw-r--r--kernel/time/alarmtimer.c27
-rw-r--r--kernel/time/clockevents.c6
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/hrtimer.c56
-rw-r--r--kernel/time/itimer.c12
-rw-r--r--kernel/time/jiffies.c4
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/posix-cpu-timers.c2
-rw-r--r--kernel/time/posix-timers.c24
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c2
-rw-r--r--kernel/time/tick-broadcast.c27
-rw-r--r--kernel/time/tick-common.c4
-rw-r--r--kernel/time/tick-oneshot.c2
-rw-r--r--kernel/time/tick-sched.c29
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/timecounter.c6
-rw-r--r--kernel/time/timekeeping.c63
-rw-r--r--kernel/time/timekeeping_internal.h6
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/timer_stats.c2
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/ftrace.c8
-rw-r--r--kernel/trace/ring_buffer.c28
-rw-r--r--kernel/trace/trace.c491
-rw-r--r--kernel/trace/trace.h27
-rw-r--r--kernel/trace/trace_benchmark.c26
-rw-r--r--kernel/trace/trace_benchmark.h2
-rw-r--r--kernel/trace/trace_branch.c2
-rw-r--r--kernel/trace/trace_entries.h15
-rw-r--r--kernel/trace/trace_events.c83
-rw-r--r--kernel/trace/trace_events_filter.c119
-rw-r--r--kernel/trace/trace_functions_graph.c35
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_irqsoff.c16
-rw-r--r--kernel/trace/trace_kprobe.c47
-rw-r--r--kernel/trace/trace_output.c30
-rw-r--r--kernel/trace/trace_sched_wakeup.c17
-rw-r--r--kernel/tracepoint.c12
-rw-r--r--kernel/uid16.c2
-rw-r--r--kernel/watchdog.c270
-rw-r--r--kernel/watchdog_hld.c227
109 files changed, 2248 insertions, 1663 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eb26e12c6c2a..12c679f769c6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
@@ -115,8 +116,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o
$(obj)/configs.o: $(obj)/config_data.h
-# config_data.h contains the same information as ikconfig.h but gzipped.
-# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
diff --git a/kernel/audit.c b/kernel/audit.c
index 67b9fbd871be..6e399bb69d7c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -107,7 +107,6 @@ static u32 audit_rate_limit;
* When set to zero, this means unlimited. */
static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
-static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
@@ -138,11 +137,18 @@ static DEFINE_SPINLOCK(audit_freelist_lock);
static int audit_freelist_count;
static LIST_HEAD(audit_freelist);
-static struct sk_buff_head audit_skb_queue;
-/* queue of skbs to send to auditd when/if it comes back */
-static struct sk_buff_head audit_skb_hold_queue;
+/* queue msgs to send via kauditd_task */
+static struct sk_buff_head audit_queue;
+/* queue msgs due to temporary unicast send problems */
+static struct sk_buff_head audit_retry_queue;
+/* queue msgs waiting for new auditd connection */
+static struct sk_buff_head audit_hold_queue;
+
+/* queue servicing thread */
static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+
+/* waitqueue for callers who are blocked on the audit backlog */
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
@@ -338,7 +344,7 @@ static int audit_set_backlog_limit(u32 limit)
static int audit_set_backlog_wait_time(u32 timeout)
{
return audit_do_config_change("audit_backlog_wait_time",
- &audit_backlog_wait_time_master, timeout);
+ &audit_backlog_wait_time, timeout);
}
static int audit_set_enabled(u32 state)
@@ -365,29 +371,10 @@ static int audit_set_failure(u32 state)
}
/*
- * Queue skbs to be sent to auditd when/if it comes back. These skbs should
- * already have been sent via prink/syslog and so if these messages are dropped
- * it is not a huge concern since we already passed the audit_log_lost()
- * notification and stuff. This is just nice to get audit messages during
- * boot before auditd is running or messages generated while auditd is stopped.
- * This only holds messages is audit_default is set, aka booting with audit=1
- * or building your kernel that way.
- */
-static void audit_hold_skb(struct sk_buff *skb)
-{
- if (audit_default &&
- (!audit_backlog_limit ||
- skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
- skb_queue_tail(&audit_skb_hold_queue, skb);
- else
- kfree_skb(skb);
-}
-
-/*
* For one reason or another this nlh isn't getting delivered to the userspace
* audit daemon, just send it to printk.
*/
-static void audit_printk_skb(struct sk_buff *skb)
+static void kauditd_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
char *data = nlmsg_data(nlh);
@@ -398,58 +385,123 @@ static void audit_printk_skb(struct sk_buff *skb)
else
audit_log_lost("printk limit exceeded");
}
+}
+
+/**
+ * kauditd_hold_skb - Queue an audit record, waiting for auditd
+ * @skb: audit record
+ *
+ * Description:
+ * Queue the audit record, waiting for an instance of auditd. When this
+ * function is called we haven't given up yet on sending the record, but things
+ * are not looking good. The first thing we want to do is try to write the
+ * record via printk and then see if we want to try and hold on to the record
+ * and queue it, if we have room. If we want to hold on to the record, but we
+ * don't have room, record a record lost message.
+ */
+static void kauditd_hold_skb(struct sk_buff *skb)
+{
+ /* at this point it is uncertain if we will ever send this to auditd so
+ * try to send the message via printk before we go any further */
+ kauditd_printk_skb(skb);
+
+ /* can we just silently drop the message? */
+ if (!audit_default) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* if we have room, queue the message */
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_hold_queue, skb);
+ return;
+ }
- audit_hold_skb(skb);
+ /* we have no other options - drop the message */
+ audit_log_lost("kauditd hold queue overflow");
+ kfree_skb(skb);
}
-static void kauditd_send_skb(struct sk_buff *skb)
+/**
+ * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
+ * @skb: audit record
+ *
+ * Description:
+ * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
+ * but for some reason we are having problems sending it audit records so
+ * queue the given record and attempt to resend.
+ */
+static void kauditd_retry_skb(struct sk_buff *skb)
{
- int err;
- int attempts = 0;
-#define AUDITD_RETRIES 5
+ /* NOTE: because records should only live in the retry queue for a
+ * short period of time, before either being sent or moved to the hold
+ * queue, we don't currently enforce a limit on this queue */
+ skb_queue_tail(&audit_retry_queue, skb);
+}
+
+/**
+ * auditd_reset - Disconnect the auditd connection
+ *
+ * Description:
+ * Break the auditd/kauditd connection and move all the records in the retry
+ * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex
+ * must be held when calling this function.
+ */
+static void auditd_reset(void)
+{
+ struct sk_buff *skb;
+
+ /* break the connection */
+ if (audit_sock) {
+ sock_put(audit_sock);
+ audit_sock = NULL;
+ }
+ audit_pid = 0;
+ audit_nlk_portid = 0;
+
+ /* flush all of the retry queue to the hold queue */
+ while ((skb = skb_dequeue(&audit_retry_queue)))
+ kauditd_hold_skb(skb);
+}
+
+/**
+ * kauditd_send_unicast_skb - Send a record via unicast to auditd
+ * @skb: audit record
+ */
+static int kauditd_send_unicast_skb(struct sk_buff *skb)
+{
+ int rc;
-restart:
- /* take a reference in case we can't send it and we want to hold it */
+ /* if we know nothing is connected, don't even try the netlink call */
+ if (!audit_pid)
+ return -ECONNREFUSED;
+
+ /* get an extra skb reference in case we fail to send */
skb_get(skb);
- err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
- if (err < 0) {
- pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
- audit_pid, err);
- if (audit_pid) {
- if (err == -ECONNREFUSED || err == -EPERM
- || ++attempts >= AUDITD_RETRIES) {
- char s[32];
-
- snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
- audit_log_lost(s);
- audit_pid = 0;
- audit_sock = NULL;
- } else {
- pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
- attempts, audit_pid);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- goto restart;
- }
- }
- /* we might get lucky and get this in the next auditd */
- audit_hold_skb(skb);
- } else
- /* drop the extra reference if sent ok */
+ rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
+ if (rc >= 0) {
consume_skb(skb);
+ rc = 0;
+ }
+
+ return rc;
}
/*
- * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ * kauditd_send_multicast_skb - Send a record to any multicast listeners
+ * @skb: audit record
*
+ * Description:
* This function doesn't consume an skb as might be expected since it has to
* copy it anyways.
*/
-static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
- struct sk_buff *copy;
- struct audit_net *aunet = net_generic(&init_net, audit_net_id);
- struct sock *sock = aunet->nlsk;
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+ struct nlmsghdr *nlh;
if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
return;
@@ -464,74 +516,161 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
* no reason for new multicast clients to continue with this
* non-compliance.
*/
- copy = skb_copy(skb, gfp_mask);
+ copy = skb_copy(skb, GFP_KERNEL);
if (!copy)
return;
+ nlh = nlmsg_hdr(copy);
+ nlh->nlmsg_len = skb->len;
- nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
}
-/*
- * flush_hold_queue - empty the hold queue if auditd appears
- *
- * If auditd just started, drain the queue of messages already
- * sent to syslog/printk. Remember loss here is ok. We already
- * called audit_log_lost() if it didn't go out normally. so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
+/**
+ * kauditd_wake_condition - Return true when it is time to wake kauditd_thread
*
- * If you ever find kauditd to be too slow we can get a perf win
- * by doing our own locking and keeping better track if there
- * are messages in this queue. I don't see the need now, but
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
+ * Description:
+ * This function is for use by the wait_event_freezable() call in
+ * kauditd_thread().
*/
-static void flush_hold_queue(void)
+static int kauditd_wake_condition(void)
{
- struct sk_buff *skb;
-
- if (!audit_default || !audit_pid)
- return;
-
- skb = skb_dequeue(&audit_skb_hold_queue);
- if (likely(!skb))
- return;
+ static int pid_last = 0;
+ int rc;
+ int pid = audit_pid;
- while (skb && audit_pid) {
- kauditd_send_skb(skb);
- skb = skb_dequeue(&audit_skb_hold_queue);
- }
+ /* wake on new messages or a change in the connected auditd */
+ rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last);
+ if (rc)
+ pid_last = pid;
- /*
- * if auditd just disappeared but we
- * dequeued an skb we need to drop ref
- */
- consume_skb(skb);
+ return rc;
}
static int kauditd_thread(void *dummy)
{
+ int rc;
+ int auditd = 0;
+ int reschedule = 0;
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+
+#define UNICAST_RETRIES 5
+#define AUDITD_BAD(x,y) \
+ ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES)
+
+ /* NOTE: we do invalidate the auditd connection flag on any sending
+ * errors, but we only "restore" the connection flag at specific places
+ * in the loop in order to help ensure proper ordering of audit
+ * records */
+
set_freezable();
while (!kthread_should_stop()) {
- struct sk_buff *skb;
-
- flush_hold_queue();
+ /* NOTE: possible area for future improvement is to look at
+ * the hold and retry queues, since only this thread
+ * has access to these queues we might be able to do
+ * our own queuing and skip some/all of the locking */
+
+ /* NOTE: it might be a fun experiment to split the hold and
+ * retry queue handling to another thread, but the
+ * synchronization issues and other overhead might kill
+ * any performance gains */
+
+ /* attempt to flush the hold queue */
+ while (auditd && (skb = skb_dequeue(&audit_hold_queue))) {
+ rc = kauditd_send_unicast_skb(skb);
+ if (rc) {
+ /* requeue to the same spot */
+ skb_queue_head(&audit_hold_queue, skb);
+
+ auditd = 0;
+ if (AUDITD_BAD(rc, reschedule)) {
+ mutex_lock(&audit_cmd_mutex);
+ auditd_reset();
+ mutex_unlock(&audit_cmd_mutex);
+ reschedule = 0;
+ }
+ } else
+ /* we were able to send successfully */
+ reschedule = 0;
+ }
- skb = skb_dequeue(&audit_skb_queue);
+ /* attempt to flush the retry queue */
+ while (auditd && (skb = skb_dequeue(&audit_retry_queue))) {
+ rc = kauditd_send_unicast_skb(skb);
+ if (rc) {
+ auditd = 0;
+ if (AUDITD_BAD(rc, reschedule)) {
+ kauditd_hold_skb(skb);
+ mutex_lock(&audit_cmd_mutex);
+ auditd_reset();
+ mutex_unlock(&audit_cmd_mutex);
+ reschedule = 0;
+ } else
+ /* temporary problem (we hope), queue
+ * to the same spot and retry */
+ skb_queue_head(&audit_retry_queue, skb);
+ } else
+ /* we were able to send successfully */
+ reschedule = 0;
+ }
+ /* standard queue processing, try to be as quick as possible */
+quick_loop:
+ skb = skb_dequeue(&audit_queue);
if (skb) {
- if (!audit_backlog_limit ||
- (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit))
- wake_up(&audit_backlog_wait);
- if (audit_pid)
- kauditd_send_skb(skb);
+ /* setup the netlink header, see the comments in
+ * kauditd_send_multicast_skb() for length quirks */
+ nlh = nlmsg_hdr(skb);
+ nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+
+ /* attempt to send to any multicast listeners */
+ kauditd_send_multicast_skb(skb);
+
+ /* attempt to send to auditd, queue on failure */
+ if (auditd) {
+ rc = kauditd_send_unicast_skb(skb);
+ if (rc) {
+ auditd = 0;
+ if (AUDITD_BAD(rc, reschedule)) {
+ mutex_lock(&audit_cmd_mutex);
+ auditd_reset();
+ mutex_unlock(&audit_cmd_mutex);
+ reschedule = 0;
+ }
+
+ /* move to the retry queue */
+ kauditd_retry_skb(skb);
+ } else
+ /* everything is working so go fast! */
+ goto quick_loop;
+ } else if (reschedule)
+ /* we are currently having problems, move to
+ * the retry queue */
+ kauditd_retry_skb(skb);
else
- audit_printk_skb(skb);
- continue;
- }
+ /* dump the message via printk and hold it */
+ kauditd_hold_skb(skb);
+ } else {
+ /* we have flushed the backlog so wake everyone */
+ wake_up(&audit_backlog_wait);
+
+ /* if everything is okay with auditd (if present), go
+ * to sleep until there is something new in the queue
+ * or we have a change in the connected auditd;
+ * otherwise simply reschedule to give things a chance
+ * to recover */
+ if (reschedule) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ } else
+ wait_event_freezable(kauditd_wait,
+ kauditd_wake_condition());
- wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
+ /* update the auditd connection status */
+ auditd = (audit_pid ? 1 : 0);
+ }
}
+
return 0;
}
@@ -596,6 +735,7 @@ static int audit_send_reply_thread(void *arg)
kfree(reply);
return 0;
}
+
/**
* audit_send_reply - send an audit reply message via netlink
* @request_skb: skb of request we are replying to (used to target the reply)
@@ -832,16 +972,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
- /* As soon as there's any sign of userspace auditd,
- * start kauditd to talk to it */
- if (!kauditd_task) {
- kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
- }
- }
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
@@ -855,9 +985,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.rate_limit = audit_rate_limit;
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
- s.backlog = skb_queue_len(&audit_skb_queue);
+ s.backlog = skb_queue_len(&audit_queue);
s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
- s.backlog_wait_time = audit_backlog_wait_time_master;
+ s.backlog_wait_time = audit_backlog_wait_time;
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
@@ -897,9 +1027,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
- audit_pid = new_pid;
- audit_nlk_portid = NETLINK_CB(skb).portid;
- audit_sock = skb->sk;
+ if (new_pid) {
+ if (audit_sock)
+ sock_put(audit_sock);
+ audit_pid = new_pid;
+ audit_nlk_portid = NETLINK_CB(skb).portid;
+ sock_hold(skb->sk);
+ audit_sock = skb->sk;
+ } else {
+ auditd_reset();
+ }
+ wake_up_interruptible(&kauditd_wait);
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(s.rate_limit);
@@ -1167,10 +1305,10 @@ static void __net_exit audit_net_exit(struct net *net)
{
struct audit_net *aunet = net_generic(net, audit_net_id);
struct sock *sock = aunet->nlsk;
- if (sock == audit_sock) {
- audit_pid = 0;
- audit_sock = NULL;
- }
+ mutex_lock(&audit_cmd_mutex);
+ if (sock == audit_sock)
+ auditd_reset();
+ mutex_unlock(&audit_cmd_mutex);
netlink_kernel_release(sock);
aunet->nlsk = NULL;
@@ -1195,17 +1333,24 @@ static int __init audit_init(void)
audit_default ? "enabled" : "disabled");
register_pernet_subsys(&audit_net_ops);
- skb_queue_head_init(&audit_skb_queue);
- skb_queue_head_init(&audit_skb_hold_queue);
+ skb_queue_head_init(&audit_queue);
+ skb_queue_head_init(&audit_retry_queue);
+ skb_queue_head_init(&audit_hold_queue);
audit_initialized = AUDIT_INITIALIZED;
audit_enabled = audit_default;
audit_ever_enabled |= !!audit_default;
- audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
-
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
+ kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+ if (IS_ERR(kauditd_task)) {
+ int err = PTR_ERR(kauditd_task);
+ panic("audit: failed to start the kauditd thread (%d)\n", err);
+ }
+
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
return 0;
}
__initcall(audit_init);
@@ -1338,24 +1483,6 @@ static inline void audit_get_stamp(struct audit_context *ctx,
}
}
-/*
- * Wait for auditd to drain the queue a little
- */
-static long wait_for_auditd(long sleep_time)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
- add_wait_queue_exclusive(&audit_backlog_wait, &wait);
- set_current_state(TASK_UNINTERRUPTIBLE);
- sleep_time = schedule_timeout(sleep_time);
- remove_wait_queue(&audit_backlog_wait, &wait);
- }
-
- return sleep_time;
-}
-
/**
* audit_log_start - obtain an audit buffer
* @ctx: audit_context (may be NULL)
@@ -1374,12 +1501,9 @@ static long wait_for_auditd(long sleep_time)
struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
- struct audit_buffer *ab = NULL;
- struct timespec t;
- unsigned int uninitialized_var(serial);
- int reserve = 5; /* Allow atomic callers to go up to five
- entries over the normal backlog limit */
- unsigned long timeout_start = jiffies;
+ struct audit_buffer *ab;
+ struct timespec t;
+ unsigned int uninitialized_var(serial);
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1387,38 +1511,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
return NULL;
- if (gfp_mask & __GFP_DIRECT_RECLAIM) {
- if (audit_pid && audit_pid == current->tgid)
- gfp_mask &= ~__GFP_DIRECT_RECLAIM;
- else
- reserve = 0;
- }
-
- while (audit_backlog_limit
- && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
- long sleep_time;
+ /* don't ever fail/sleep on these two conditions:
+ * 1. auditd generated record - since we need auditd to drain the
+ * queue; also, when we are checking for auditd, compare PIDs using
+ * task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
+ * using a PID anchored in the caller's namespace
+ * 2. audit command message - record types 1000 through 1099 inclusive
+ * are command messages/records used to manage the kernel subsystem
+ * and the audit userspace, blocking on these messages could cause
+ * problems under load so don't do it (note: not all of these
+ * command types are valid as record types, but it is quicker to
+ * just check two ints than a series of ints in a if/switch stmt) */
+ if (!((audit_pid && audit_pid == task_tgid_vnr(current)) ||
+ (type >= 1000 && type <= 1099))) {
+ long sleep_time = audit_backlog_wait_time;
+
+ while (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
- sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
- if (sleep_time > 0) {
- sleep_time = wait_for_auditd(sleep_time);
- if (sleep_time > 0)
- continue;
+ /* sleep if we are allowed and we haven't exhausted our
+ * backlog wait limit */
+ if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
+ (sleep_time > 0)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(&audit_backlog_wait,
+ &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ sleep_time = schedule_timeout(sleep_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ } else {
+ if (audit_rate_check() && printk_ratelimit())
+ pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_queue),
+ audit_backlog_limit);
+ audit_log_lost("backlog limit exceeded");
+ return NULL;
}
}
- if (audit_rate_check() && printk_ratelimit())
- pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
- skb_queue_len(&audit_skb_queue),
- audit_backlog_limit);
- audit_log_lost("backlog limit exceeded");
- audit_backlog_wait_time = 0;
- wake_up(&audit_backlog_wait);
- return NULL;
}
- if (!reserve && !audit_backlog_wait_time)
- audit_backlog_wait_time = audit_backlog_wait_time_master;
-
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
@@ -1426,9 +1560,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
}
audit_get_stamp(ab->ctx, &t, &serial);
-
audit_log_format(ab, "audit(%lu.%03lu:%u): ",
t.tv_sec, t.tv_nsec/1000000, serial);
+
return ab;
}
@@ -1759,7 +1893,7 @@ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
* @call_panic: optional pointer to int that will be updated if secid fails
*/
void audit_log_name(struct audit_context *context, struct audit_names *n,
- struct path *path, int record_num, int *call_panic)
+ const struct path *path, int record_num, int *call_panic)
{
struct audit_buffer *ab;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
@@ -1947,7 +2081,7 @@ EXPORT_SYMBOL(audit_log_task_info);
* @operation: specific link operation
* @link: the path that triggered the restriction
*/
-void audit_log_link_denied(const char *operation, struct path *link)
+void audit_log_link_denied(const char *operation, const struct path *link)
{
struct audit_buffer *ab;
struct audit_names *name;
@@ -1978,10 +2112,10 @@ out:
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * netlink_unicast() cannot be called inside an irq context because it blocks
- * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
- * on a queue and a tasklet is scheduled to remove them from the queue outside
- * the irq context. May be called in any context.
+ * We can not do a netlink send inside an irq context because it blocks (last
+ * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
+ * queue and a tasklet is scheduled to remove them from the queue outside the
+ * irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1990,28 +2124,8 @@ void audit_log_end(struct audit_buffer *ab)
if (!audit_rate_check()) {
audit_log_lost("rate limit exceeded");
} else {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-
- nlh->nlmsg_len = ab->skb->len;
- kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
-
- /*
- * The original kaudit unicast socket sends up messages with
- * nlmsg_len set to the payload length rather than the entire
- * message length. This breaks the standard set by netlink.
- * The existing auditd daemon assumes this breakage. Fixing
- * this would require co-ordinating a change in the established
- * protocol between the kaudit kernel subsystem and the auditd
- * userspace code.
- */
- nlh->nlmsg_len -= NLMSG_HDRLEN;
-
- if (audit_pid) {
- skb_queue_tail(&audit_skb_queue, ab->skb);
- wake_up_interruptible(&kauditd_wait);
- } else {
- audit_printk_skb(ab->skb);
- }
+ skb_queue_tail(&audit_queue, ab->skb);
+ wake_up_interruptible(&kauditd_wait);
ab->skb = NULL;
}
audit_buffer_free(ab);
diff --git a/kernel/audit.h b/kernel/audit.h
index 431444c3708b..960d49c9db5e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -212,7 +212,7 @@ extern void audit_copy_inode(struct audit_names *name,
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
extern void audit_log_name(struct audit_context *context,
- struct audit_names *n, struct path *path,
+ struct audit_names *n, const struct path *path,
int record_num, int *call_panic);
extern int audit_pid;
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index f84f8d06e1f6..7ea57e516029 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -74,7 +74,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_
}
static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
- struct inode *inode)
+ const struct inode *inode)
{
audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
@@ -130,10 +130,9 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "auid=%u ses=%u op=",
+ audit_log_format(ab, "auid=%u ses=%u op=%s",
from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current));
- audit_log_string(ab, op);
+ audit_get_sessionid(current), op);
audit_log_format(ab, " path=");
audit_log_untrustedstring(ab, audit_mark->path);
audit_log_key(ab, rule->filterkey);
@@ -168,11 +167,11 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *dname, u32 cookie)
{
struct audit_fsnotify_mark *audit_mark;
- struct inode *inode = NULL;
+ const struct inode *inode = NULL;
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
@@ -180,10 +179,10 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = ((struct path *)data)->dentry->d_inode;
+ inode = ((const struct path *)data)->dentry->d_inode;
break;
case (FSNOTIFY_EVENT_INODE):
- inode = (struct inode *)data;
+ inode = (const struct inode *)data;
break;
default:
BUG();
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 25772476fa4a..7b44195da81b 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -231,9 +231,11 @@ static void untag_chunk(struct node *p)
if (size)
new = alloc_chunk(size);
+ mutex_lock(&entry->group->mark_mutex);
spin_lock(&entry->lock);
if (chunk->dead || !entry->inode) {
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
if (new)
free_chunk(new);
goto out;
@@ -251,6 +253,7 @@ static void untag_chunk(struct node *p)
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
@@ -258,8 +261,8 @@ static void untag_chunk(struct node *p)
if (!new)
goto Fallback;
- fsnotify_duplicate_mark(&new->mark, entry);
- if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
+ if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode,
+ NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -293,6 +296,7 @@ static void untag_chunk(struct node *p)
owner->root = new;
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
@@ -309,6 +313,7 @@ Fallback:
put_tree(owner);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
out:
fsnotify_put_mark(entry);
spin_lock(&hash_lock);
@@ -386,18 +391,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk_entry = &chunk->mark;
+ mutex_lock(&old_entry->group->mark_mutex);
spin_lock(&old_entry->lock);
if (!old_entry->inode) {
/* old_entry is being shot, lets just lie */
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(old_entry);
free_chunk(chunk);
return -ENOENT;
}
- fsnotify_duplicate_mark(chunk_entry, old_entry);
- if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
+ if (fsnotify_add_mark_locked(chunk_entry, old_entry->group,
+ old_entry->inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return -ENOSPC;
@@ -413,6 +421,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk->dead = 1;
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_destroy_mark(chunk_entry, audit_tree_group);
@@ -445,6 +454,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_destroy_mark(old_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
@@ -458,8 +468,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "op=");
- audit_log_string(ab, "remove_rule");
+ audit_log_format(ab, "op=remove_rule");
audit_log_format(ab, " dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
@@ -948,7 +957,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *file_name, u32 cookie)
{
return 0;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0d302a87f21b..f79e4658433d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -242,10 +242,9 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "auid=%u ses=%u op=",
+ audit_log_format(ab, "auid=%u ses=%u op=%s",
from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current));
- audit_log_string(ab, op);
+ audit_get_sessionid(current), op);
audit_log_format(ab, " path=");
audit_log_untrustedstring(ab, w->path);
audit_log_key(ab, r->filterkey);
@@ -472,10 +471,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *dname, u32 cookie)
{
- struct inode *inode;
+ const struct inode *inode;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
@@ -484,10 +483,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = d_backing_inode(((struct path *)data)->dentry);
+ inode = d_backing_inode(((const struct path *)data)->dentry);
break;
case (FSNOTIFY_EVENT_INODE):
- inode = (struct inode *)data;
+ inode = (const struct inode *)data;
break;
default:
BUG();
@@ -548,8 +547,8 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
exe_file = get_task_exe_file(tsk);
if (!exe_file)
return 0;
- ino = exe_file->f_inode->i_ino;
- dev = exe_file->f_inode->i_sb->s_dev;
+ ino = file_inode(exe_file)->i_ino;
+ dev = file_inode(exe_file)->i_sb->s_dev;
fput(exe_file);
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 85d9cac497e4..880519d6cf2a 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -363,6 +363,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
case AUDIT_EXIT:
case AUDIT_SUCCESS:
case AUDIT_INODE:
+ case AUDIT_SESSIONID:
/* bit ops are only useful on syscall args */
if (f->op == Audit_bitmask || f->op == Audit_bittest)
return -EINVAL;
@@ -476,6 +477,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
if (!gid_valid(f->gid))
goto exit_free;
break;
+ case AUDIT_SESSIONID:
case AUDIT_ARCH:
entry->rule.arch_f = f;
break;
@@ -1074,8 +1076,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
return;
audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " op=");
- audit_log_string(ab, action);
+ audit_log_format(ab, " op=%s", action);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
audit_log_end(ab);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2cd5256dbff7..cf1fa43512c1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -446,6 +446,7 @@ static int audit_filter_rules(struct task_struct *tsk,
const struct cred *cred;
int i, need_sid = 1;
u32 sid;
+ unsigned int sessionid;
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
@@ -508,6 +509,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_FSGID:
result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
+ case AUDIT_SESSIONID:
+ sessionid = audit_get_sessionid(current);
+ result = audit_comparator(sessionid, f->op, f->val);
+ break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
break;
@@ -1000,7 +1005,7 @@ static void audit_log_execve_info(struct audit_context *context,
long len_rem;
long len_full;
long len_buf;
- long len_abuf;
+ long len_abuf = 0;
long len_tmp;
bool require_data;
bool encode;
@@ -2025,8 +2030,11 @@ int audit_set_loginuid(kuid_t loginuid)
goto out;
/* are we setting or clearing? */
- if (uid_valid(loginuid))
+ if (uid_valid(loginuid)) {
sessionid = (unsigned int)atomic_inc_return(&session_id);
+ if (unlikely(sessionid == (unsigned int)-1))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ }
task->sessionid = sessionid;
task->loginuid = loginuid;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index a2ac051c342f..229a5d5df977 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL);
- if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
+ if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 83e0d153b0b4..503d4211988a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -105,19 +105,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
gfp_extra_flags;
struct bpf_prog *fp;
+ u32 pages, delta;
+ int ret;
BUG_ON(fp_old == NULL);
size = round_up(size, PAGE_SIZE);
- if (size <= fp_old->pages * PAGE_SIZE)
+ pages = size / PAGE_SIZE;
+ if (pages <= fp_old->pages)
return fp_old;
+ delta = pages - fp_old->pages;
+ ret = __bpf_prog_charge(fp_old->aux->user, delta);
+ if (ret)
+ return NULL;
+
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- if (fp != NULL) {
+ if (fp == NULL) {
+ __bpf_prog_uncharge(fp_old->aux->user, delta);
+ } else {
kmemcheck_annotate_bitfield(fp, meta);
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
- fp->pages = size / PAGE_SIZE;
+ fp->pages = pages;
fp->aux->prog = fp;
/* We keep fp->aux from fp_old around in the new
@@ -136,28 +146,30 @@ void __bpf_prog_free(struct bpf_prog *fp)
vfree(fp);
}
-#define SHA_BPF_RAW_SIZE \
- round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES)
-
-/* Called under verifier mutex. */
-void bpf_prog_calc_digest(struct bpf_prog *fp)
+int bpf_prog_calc_tag(struct bpf_prog *fp)
{
const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
- static u32 ws[SHA_WORKSPACE_WORDS];
- static u8 raw[SHA_BPF_RAW_SIZE];
- struct bpf_insn *dst = (void *)raw;
+ u32 raw_size = bpf_prog_tag_scratch_size(fp);
+ u32 digest[SHA_DIGEST_WORDS];
+ u32 ws[SHA_WORKSPACE_WORDS];
u32 i, bsize, psize, blocks;
+ struct bpf_insn *dst;
bool was_ld_map;
- u8 *todo = raw;
+ u8 *raw, *todo;
__be32 *result;
__be64 *bits;
- sha_init(fp->digest);
+ raw = vmalloc(raw_size);
+ if (!raw)
+ return -ENOMEM;
+
+ sha_init(digest);
memset(ws, 0, sizeof(ws));
/* We need to take out the map fd for the digest calculation
* since they are unstable from user space side.
*/
+ dst = (void *)raw;
for (i = 0, was_ld_map = false; i < fp->len; i++) {
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
@@ -177,12 +189,13 @@ void bpf_prog_calc_digest(struct bpf_prog *fp)
}
}
- psize = fp->len * sizeof(struct bpf_insn);
- memset(&raw[psize], 0, sizeof(raw) - psize);
+ psize = bpf_prog_insn_size(fp);
+ memset(&raw[psize], 0, raw_size - psize);
raw[psize++] = 0x80;
bsize = round_up(psize, SHA_MESSAGE_BYTES);
blocks = bsize / SHA_MESSAGE_BYTES;
+ todo = raw;
if (bsize - psize >= sizeof(__be64)) {
bits = (__be64 *)(todo + bsize - sizeof(__be64));
} else {
@@ -192,13 +205,17 @@ void bpf_prog_calc_digest(struct bpf_prog *fp)
*bits = cpu_to_be64((psize - 1) << 3);
while (blocks--) {
- sha_transform(fp->digest, todo, ws);
+ sha_transform(digest, todo, ws);
todo += SHA_MESSAGE_BYTES;
}
- result = (__force __be32 *)fp->digest;
+ result = (__force __be32 *)digest;
for (i = 0; i < SHA_DIGEST_WORDS; i++)
- result[i] = cpu_to_be32(fp->digest[i]);
+ result[i] = cpu_to_be32(digest[i]);
+ memcpy(fp->tag, result, sizeof(fp->tag));
+
+ vfree(raw);
+ return 0;
}
static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34debc1a9641..3f2bb58952d8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -274,7 +274,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
- if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
+ if (htab->map.value_size >= KMALLOC_MAX_SIZE -
MAX_BPF_STACK - sizeof(struct htab_elem))
/* if value_size is bigger, the user space won't be able to
* access the elements via bpf syscall. This check also makes
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4819ec9d95f6..1d6b29e4e2c3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -615,19 +615,39 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+int __bpf_prog_charge(struct user_struct *user, u32 pages)
+{
+ unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ unsigned long user_bufs;
+
+ if (user) {
+ user_bufs = atomic_long_add_return(pages, &user->locked_vm);
+ if (user_bufs > memlock_limit) {
+ atomic_long_sub(pages, &user->locked_vm);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
+{
+ if (user)
+ atomic_long_sub(pages, &user->locked_vm);
+}
+
static int bpf_prog_charge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = get_current_user();
- unsigned long memlock_limit;
-
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ int ret;
- atomic_long_add(prog->pages, &user->locked_vm);
- if (atomic_long_read(&user->locked_vm) > memlock_limit) {
- atomic_long_sub(prog->pages, &user->locked_vm);
+ ret = __bpf_prog_charge(user, prog->pages);
+ if (ret) {
free_uid(user);
- return -EPERM;
+ return ret;
}
+
prog->aux->user = user;
return 0;
}
@@ -636,7 +656,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = prog->aux->user;
- atomic_long_sub(prog->pages, &user->locked_vm);
+ __bpf_prog_uncharge(user, prog->pages);
free_uid(user);
}
@@ -668,17 +688,17 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
- char prog_digest[sizeof(prog->digest) * 2 + 1] = { };
+ char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_digest, prog->digest, sizeof(prog->digest));
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
- "prog_digest:\t%s\n"
+ "prog_tag:\t%s\n"
"memlock:\t%llu\n",
prog->type,
prog->jited,
- prog_digest,
+ prog_tag,
prog->pages * 1ULL << PAGE_SHIFT);
}
#endif
@@ -811,7 +831,7 @@ static int bpf_prog_load(union bpf_attr *attr)
err = -EFAULT;
if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
- prog->len * sizeof(struct bpf_insn)) != 0)
+ bpf_prog_insn_size(prog)) != 0)
goto free_prog;
prog->orig_prog = NULL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d28f9a3380a9..cdc43b899f28 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -462,14 +462,19 @@ static void init_reg_state(struct bpf_reg_state *regs)
regs[BPF_REG_1].type = PTR_TO_CTX;
}
-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
{
- BUG_ON(regno >= MAX_BPF_REG);
regs[regno].type = UNKNOWN_VALUE;
regs[regno].id = 0;
regs[regno].imm = 0;
}
+static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ __mark_reg_unknown_value(regs, regno);
+}
+
static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
{
regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
@@ -1970,8 +1975,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
reg->type = type;
+ /* We don't need id from this point onwards anymore, thus we
+ * should better reset it, so that state pruning has chances
+ * to take effect.
+ */
+ reg->id = 0;
if (type == UNKNOWN_VALUE)
- mark_reg_unknown_value(regs, regno);
+ __mark_reg_unknown_value(regs, regno);
}
}
@@ -1982,16 +1992,16 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
enum bpf_reg_type type)
{
struct bpf_reg_state *regs = state->regs;
+ u32 id = regs[regno].id;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- mark_map_reg(regs, i, regs[regno].id, type);
+ mark_map_reg(regs, i, id, type);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE,
- regs[regno].id, type);
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
}
}
@@ -2926,6 +2936,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
int insn_cnt = env->prog->len;
int i, j, err;
+ err = bpf_prog_calc_tag(env->prog);
+ if (err)
+ return err;
+
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
@@ -3173,8 +3187,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
log_level = 0;
}
- bpf_prog_calc_digest(env->prog);
-
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
diff --git a/kernel/capability.c b/kernel/capability.c
index 00411c82dac5..f97fe77ceb88 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -17,7 +17,7 @@
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* Leveraged for setting/resetting capabilities
@@ -318,6 +318,7 @@ bool has_capability(struct task_struct *t, int cap)
{
return has_ns_capability(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability);
/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
@@ -457,6 +458,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
+ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
+ * @ns: The user namespace in question
+ * @inode: The inode in question
+ *
+ * Return true if the inode uid and gid are within the namespace.
+ */
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+{
+ return kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+
+/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
@@ -469,7 +483,26 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
+ * in the specified user namespace.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+ int ret = 0; /* An absent tracer adds no restrictions */
+ const struct cred *cred;
+ rcu_read_lock();
+ cred = rcu_dereference(tsk->ptracer_cred);
+ if (cred)
+ ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ rcu_read_unlock();
+ return (ret == 0);
+}
diff --git a/kernel/compat.c b/kernel/compat.c
index b3a047f208a7..19aec5d98108 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -28,7 +28,7 @@
#include <linux/ptrace.h>
#include <linux/gfp.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
{
diff --git a/kernel/configs.c b/kernel/configs.c
index c18b1f1ae515..2df132b20217 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -28,7 +28,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/**************************************************/
/* the actual current config file */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 217fd2e7f435..c47506357519 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -183,23 +183,16 @@ EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
/*
* The following two APIs (cpu_maps_update_begin/done) must be used when
* attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
- * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
- * hotplug callback (un)registration performed using __register_cpu_notifier()
- * or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
-EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
-EXPORT_SYMBOL(cpu_notifier_register_done);
-
-static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
@@ -349,66 +342,7 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
-/* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
-{
- int ret;
- cpu_maps_update_begin();
- ret = raw_notifier_chain_register(&cpu_chain, nb);
- cpu_maps_update_done();
- return ret;
-}
-
-int __register_cpu_notifier(struct notifier_block *nb)
-{
- return raw_notifier_chain_register(&cpu_chain, nb);
-}
-
-static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call,
- int *nr_calls)
-{
- unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0;
- void *hcpu = (void *)(long)cpu;
-
- int ret;
-
- ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call,
- nr_calls);
-
- return notifier_to_errno(ret);
-}
-
-static int cpu_notify(unsigned long val, unsigned int cpu)
-{
- return __cpu_notify(val, cpu, -1, NULL);
-}
-
-static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
-{
- BUG_ON(cpu_notify(val, cpu));
-}
-
/* Notifier wrappers for transitioning to state machine */
-static int notify_prepare(unsigned int cpu)
-{
- int nr_calls = 0;
- int ret;
-
- ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls);
- if (ret) {
- nr_calls--;
- printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
- __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL);
- }
- return ret;
-}
-
-static int notify_online(unsigned int cpu)
-{
- cpu_notify(CPU_ONLINE, cpu);
- return 0;
-}
static int bringup_wait_for_ap(unsigned int cpu)
{
@@ -433,10 +367,8 @@ static int bringup_cpu(unsigned int cpu)
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
irq_unlock_sparse();
- if (ret) {
- cpu_notify(CPU_UP_CANCELED, cpu);
+ if (ret)
return ret;
- }
ret = bringup_wait_for_ap(cpu);
BUG_ON(!cpu_online(cpu));
return ret;
@@ -565,11 +497,6 @@ static void cpuhp_thread_fun(unsigned int cpu)
BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
undo_cpu_down(cpu, st);
- /*
- * This is a momentary workaround to keep the notifier users
- * happy. Will go away once we got rid of the notifiers.
- */
- cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
st->rollback = false;
} else {
/* Cannot happen .... */
@@ -659,22 +586,6 @@ void __init cpuhp_threads_init(void)
kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
-EXPORT_SYMBOL(register_cpu_notifier);
-EXPORT_SYMBOL(__register_cpu_notifier);
-void unregister_cpu_notifier(struct notifier_block *nb)
-{
- cpu_maps_update_begin();
- raw_notifier_chain_unregister(&cpu_chain, nb);
- cpu_maps_update_done();
-}
-EXPORT_SYMBOL(unregister_cpu_notifier);
-
-void __unregister_cpu_notifier(struct notifier_block *nb)
-{
- raw_notifier_chain_unregister(&cpu_chain, nb);
-}
-EXPORT_SYMBOL(__unregister_cpu_notifier);
-
#ifdef CONFIG_HOTPLUG_CPU
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
@@ -741,20 +652,6 @@ static inline void check_for_tasks(int dead_cpu)
read_unlock(&tasklist_lock);
}
-static int notify_down_prepare(unsigned int cpu)
-{
- int err, nr_calls = 0;
-
- err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls);
- if (err) {
- nr_calls--;
- __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL);
- pr_warn("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
- }
- return err;
-}
-
/* Take this CPU down. */
static int take_cpu_down(void *_param)
{
@@ -833,13 +730,6 @@ static int takedown_cpu(unsigned int cpu)
return 0;
}
-static int notify_dead(unsigned int cpu)
-{
- cpu_notify_nofail(CPU_DEAD, cpu);
- check_for_tasks(cpu);
- return 0;
-}
-
static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
@@ -863,9 +753,7 @@ void cpuhp_report_idle_dead(void)
}
#else
-#define notify_down_prepare NULL
#define takedown_cpu NULL
-#define notify_dead NULL
#endif
#ifdef CONFIG_HOTPLUG_CPU
@@ -924,9 +812,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
out:
cpu_hotplug_done();
- /* This post dead nonsense must die */
- if (!ret && hasdied)
- cpu_notify_nofail(CPU_POST_DEAD, cpu);
return ret;
}
@@ -1292,17 +1177,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown.single = rcutree_dead_cpu,
},
/*
- * Preparatory and dead notifiers. Will be replaced once the notifiers
- * are converted to states.
- */
- [CPUHP_NOTIFY_PREPARE] = {
- .name = "notify:prepare",
- .startup.single = notify_prepare,
- .teardown.single = notify_dead,
- .skip_onerr = true,
- .cant_stop = true,
- },
- /*
* On the tear-down path, timers_dead_cpu() must be invoked
* before blk_mq_queue_reinit_notify() from notify_dead(),
* otherwise a RCU stall occurs.
@@ -1391,17 +1265,6 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup.single = rcutree_online_cpu,
.teardown.single = rcutree_offline_cpu,
},
-
- /*
- * Online/down_prepare notifiers. Will be removed once the notifiers
- * are converted to states.
- */
- [CPUHP_AP_NOTIFY_ONLINE] = {
- .name = "notify:online",
- .startup.single = notify_online,
- .teardown.single = notify_down_prepare,
- .skip_onerr = true,
- },
#endif
/*
* The dynamically registered state space is here
@@ -1432,23 +1295,67 @@ static int cpuhp_cb_check(enum cpuhp_state state)
return 0;
}
-static void cpuhp_store_callbacks(enum cpuhp_state state,
- const char *name,
- int (*startup)(unsigned int cpu),
- int (*teardown)(unsigned int cpu),
- bool multi_instance)
+/*
+ * Returns a free for dynamic slot assignment of the Online state. The states
+ * are protected by the cpuhp_slot_states mutex and an empty slot is identified
+ * by having no name assigned.
+ */
+static int cpuhp_reserve_state(enum cpuhp_state state)
+{
+ enum cpuhp_state i, end;
+ struct cpuhp_step *step;
+
+ switch (state) {
+ case CPUHP_AP_ONLINE_DYN:
+ step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN;
+ end = CPUHP_AP_ONLINE_DYN_END;
+ break;
+ case CPUHP_BP_PREPARE_DYN:
+ step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN;
+ end = CPUHP_BP_PREPARE_DYN_END;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ for (i = state; i <= end; i++, step++) {
+ if (!step->name)
+ return i;
+ }
+ WARN(1, "No more dynamic states available for CPU hotplug\n");
+ return -ENOSPC;
+}
+
+static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
{
/* (Un)Install the callbacks for further cpu hotplug operations */
struct cpuhp_step *sp;
+ int ret = 0;
mutex_lock(&cpuhp_state_mutex);
+
+ if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
+ ret = cpuhp_reserve_state(state);
+ if (ret < 0)
+ goto out;
+ state = ret;
+ }
sp = cpuhp_get_step(state);
+ if (name && sp->name) {
+ ret = -EBUSY;
+ goto out;
+ }
sp->startup.single = startup;
sp->teardown.single = teardown;
sp->name = name;
sp->multi_instance = multi_instance;
INIT_HLIST_HEAD(&sp->list);
+out:
mutex_unlock(&cpuhp_state_mutex);
+ return ret;
}
static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
@@ -1509,29 +1416,6 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
}
}
-/*
- * Returns a free for dynamic slot assignment of the Online state. The states
- * are protected by the cpuhp_slot_states mutex and an empty slot is identified
- * by having no name assigned.
- */
-static int cpuhp_reserve_state(enum cpuhp_state state)
-{
- enum cpuhp_state i;
-
- mutex_lock(&cpuhp_state_mutex);
- for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
- if (cpuhp_ap_states[i].name)
- continue;
-
- cpuhp_ap_states[i].name = "Reserved";
- mutex_unlock(&cpuhp_state_mutex);
- return i;
- }
- mutex_unlock(&cpuhp_state_mutex);
- WARN(1, "No more dynamic states available for CPU hotplug\n");
- return -ENOSPC;
-}
-
int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
bool invoke)
{
@@ -1580,13 +1464,19 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
* __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
- * @state: The state to setup
- * @invoke: If true, the startup function is invoked for cpus where
- * cpu state >= @state
- * @startup: startup callback function
- * @teardown: teardown callback function
+ * @state: The state to setup
+ * @invoke: If true, the startup function is invoked for cpus where
+ * cpu state >= @state
+ * @startup: startup callback function
+ * @teardown: teardown callback function
+ * @multi_instance: State is set up for multiple instances which get
+ * added afterwards.
*
- * Returns 0 if successful, otherwise a proper error code
+ * Returns:
+ * On success:
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN
+ * 0 for all other states
+ * On failure: proper (negative) error code
*/
int __cpuhp_setup_state(enum cpuhp_state state,
const char *name, bool invoke,
@@ -1595,25 +1485,23 @@ int __cpuhp_setup_state(enum cpuhp_state state,
bool multi_instance)
{
int cpu, ret = 0;
- int dyn_state = 0;
+ bool dynstate;
if (cpuhp_cb_check(state) || !name)
return -EINVAL;
get_online_cpus();
- /* currently assignments for the ONLINE state are possible */
- if (state == CPUHP_AP_ONLINE_DYN) {
- dyn_state = 1;
- ret = cpuhp_reserve_state(state);
- if (ret < 0)
- goto out;
+ ret = cpuhp_store_callbacks(state, name, startup, teardown,
+ multi_instance);
+
+ dynstate = state == CPUHP_AP_ONLINE_DYN;
+ if (ret > 0 && dynstate) {
state = ret;
+ ret = 0;
}
- cpuhp_store_callbacks(state, name, startup, teardown, multi_instance);
-
- if (!invoke || !startup)
+ if (ret || !invoke || !startup)
goto out;
/*
@@ -1637,7 +1525,11 @@ int __cpuhp_setup_state(enum cpuhp_state state,
}
out:
put_online_cpus();
- if (!ret && dyn_state)
+ /*
+ * If the requested state is CPUHP_AP_ONLINE_DYN, return the
+ * dynamically allocated state in case of success.
+ */
+ if (!ret && dynstate)
return state;
return ret;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 29f815d2ef7e..b3088886cd37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..79517e5549f1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -598,11 +598,11 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- time_left = loops_per_jiffy * HZ;
+ time_left = MSEC_PER_SEC;
while (kgdb_do_roundup && --time_left &&
(atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
online_cpus)
- cpu_relax();
+ udelay(1000);
if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n");
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 98c9011eac78..e74be38245ad 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -30,6 +30,7 @@
char kdb_prompt_str[CMD_BUFLEN];
int kdb_trap_printk;
+int kdb_printf_cpu = -1;
static int kgdb_transition_check(char *buffer)
{
@@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int linecount;
int colcount;
int logging, saved_loglevel = 0;
- int saved_trap_printk;
- int got_printf_lock = 0;
int retlen = 0;
int fnd, len;
+ int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
struct console *c = console_drivers;
- static DEFINE_SPINLOCK(kdb_printf_lock);
unsigned long uninitialized_var(flags);
- preempt_disable();
- saved_trap_printk = kdb_trap_printk;
- kdb_trap_printk = 0;
-
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
* even if it is interleaved with any other text.
*/
- if (!KDB_STATE(PRINTF_LOCK)) {
- KDB_STATE_SET(PRINTF_LOCK);
- spin_lock_irqsave(&kdb_printf_lock, flags);
- got_printf_lock = 1;
- atomic_inc(&kdb_event);
- } else {
- __acquire(kdb_printf_lock);
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ for (;;) {
+ old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
+ if (old_cpu == -1 || old_cpu == this_cpu)
+ break;
+
+ cpu_relax();
}
diag = kdbgetintenv("LINES", &linecount);
@@ -847,16 +843,9 @@ kdb_print_out:
suspend_grep = 0; /* end of what may have been a recursive call */
if (logging)
console_loglevel = saved_loglevel;
- if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
- got_printf_lock = 0;
- spin_unlock_irqrestore(&kdb_printf_lock, flags);
- KDB_STATE_CLEAR(PRINTF_LOCK);
- atomic_dec(&kdb_event);
- } else {
- __release(kdb_printf_lock);
- }
- kdb_trap_printk = saved_trap_printk;
- preempt_enable();
+ /* kdb_printf_cpu locked the code above. */
+ smp_store_release(&kdb_printf_cpu, old_cpu);
+ local_irq_restore(flags);
return retlen;
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2a20c0dfdafc..ca183919d302 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -60,7 +60,6 @@ int kdb_grep_trailing;
* Kernel debugger state flags
*/
int kdb_flags;
-atomic_t kdb_event;
/*
* kdb_lock protects updates to kdb_initial_cpu. Used to
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 75014d7f4568..fc224fbcf954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -132,7 +132,6 @@ extern int kdb_state;
#define KDB_STATE_PAGER 0x00000400 /* pager is available */
#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
* back to initial cpu */
-#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
diff --git a/kernel/events/core.c b/kernel/events/core.c
index faf073d0287f..110b38a58493 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2249,7 +2249,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- bool activate = true;
+ bool reprogram = true;
int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock);
@@ -2257,27 +2257,26 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- /* If we're on the wrong CPU, try again */
- if (task_cpu(ctx->task) != smp_processor_id()) {
- ret = -ESRCH;
- goto unlock;
- }
+ reprogram = (ctx->task == current);
/*
- * If we're on the right CPU, see if the task we target is
- * current, if not we don't have to activate the ctx, a future
- * context switch will do that for us.
+ * If the task is running, it must be running on this CPU,
+ * otherwise we cannot reprogram things.
+ *
+ * If its not running, we don't care, ctx->lock will
+ * serialize against it becoming runnable.
*/
- if (ctx->task != current)
- activate = false;
- else
- WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}
- if (activate) {
+ if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx);
@@ -2328,13 +2327,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry, this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in() ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by task_function_call(), if the IPI fails to hit the task
+ * we know any future context switch of task must see the
+ * perf_event_ctpx[] store.
*/
-again:
+
/*
- * Cannot use task_function_call() because we need to run on the task's
- * CPU regardless of whether its current or not.
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
*/
- if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -2348,12 +2370,16 @@ again:
raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_unlock_irq(&ctx->lock);
/*
- * Since !ctx->is_active doesn't mean anything, we must IPI
- * unconditionally.
+ * If the task is not running, ctx->lock will avoid it becoming so,
+ * thus we can safely install the event.
*/
- goto again;
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
}
/*
@@ -6698,7 +6724,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file->f_inode)
+ if (filter->inode != file_inode(file))
return false;
if (filter->offset > offset + size)
@@ -7034,25 +7060,12 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7093,34 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -9503,6 +9544,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
+/*
+ * Variation on perf_event_ctx_lock_nested(), except we take two context
+ * mutexes.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+ struct perf_event_context *ctx)
+{
+ struct perf_event_context *gctx;
+
+again:
+ rcu_read_lock();
+ gctx = READ_ONCE(group_leader->ctx);
+ if (!atomic_inc_not_zero(&gctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ if (group_leader->ctx != gctx) {
+ mutex_unlock(&ctx->mutex);
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ goto again;
+ }
+
+ return gctx;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -9746,12 +9818,31 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- gctx = group_leader->ctx;
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
if (gctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
goto err_locked;
}
+
+ /*
+ * Check if we raced against another sys_perf_event_open() call
+ * moving the software group underneath us.
+ */
+ if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * If someone moved the group out from under us, check
+ * if this new event wound up on the same ctx, if so
+ * its the regular !move_group case, otherwise fail.
+ */
+ if (gctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ } else {
+ perf_event_ctx_unlock(group_leader, gctx);
+ move_group = 0;
+ }
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -9853,7 +9944,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_unpin_context(ctx);
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -9879,7 +9970,7 @@ SYSCALL_DEFINE5(perf_event_open,
err_locked:
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
/* err_file: */
fput(event_file);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f9ec9add2164..d416f3baf392 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
- &vma);
+ &vma, NULL);
if (ret <= 0)
return ret;
@@ -1194,7 +1194,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
- copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
if (!xol_add_vma(mm, area))
return area;
@@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* essentially a kernel access to the memory.
*/
result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
- NULL);
+ NULL, NULL);
if (result < 0)
return result;
diff --git a/kernel/exit.c b/kernel/exit.c
index aacff8e2aec0..8f14b866f9f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,7 +56,7 @@
#include <linux/kcov.h>
#include <linux/random.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
diff --git a/kernel/extable.c b/kernel/extable.c
index e820ccee9846..e3beec4a2339 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -22,7 +22,7 @@
#include <linux/init.h>
#include <asm/sections.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* mutex protecting text section modification (dynamic code patching).
diff --git a/kernel/fork.c b/kernel/fork.c
index a439ac429669..11c5c8ab827c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -79,7 +79,7 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -747,7 +747,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+ struct user_namespace *user_ns)
{
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
@@ -787,6 +788,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
if (init_new_context(p, mm))
goto fail_nocontext;
+ mm->user_ns = get_user_ns(user_ns);
return mm;
fail_nocontext:
@@ -832,7 +834,7 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- return mm_init(mm, current);
+ return mm_init(mm, current, current_user_ns());
}
/*
@@ -847,6 +849,7 @@ void __mmdrop(struct mm_struct *mm)
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
+ put_user_ns(mm->user_ns);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1128,7 +1131,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm, tsk))
+ if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
err = dup_mmap(mm, oldmm);
diff --git a/kernel/futex.c b/kernel/futex.c
index 9246d9f593d1..0842c8ca534b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2459,7 +2459,7 @@ retry:
restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
- restart->futex.time = abs_time->tv64;
+ restart->futex.time = *abs_time;
restart->futex.bitset = bitset;
restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
@@ -2480,7 +2480,7 @@ static long futex_wait_restart(struct restart_block *restart)
ktime_t t, *tp = NULL;
if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t.tv64 = restart->futex.time;
+ t = restart->futex.time;
tp = &t;
}
restart->fn = do_no_restart_syscall;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 4ae3232e7a28..3f409968e466 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -13,7 +13,7 @@
#include <linux/ptrace.h>
#include <linux/syscalls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
diff --git a/kernel/groups.c b/kernel/groups.c
index 2fcadd66a8fd..8dd7a61b7115 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -8,7 +8,7 @@
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
#include <linux/vmalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
struct group_info *groups_alloc(int gidsetsize)
{
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 9be9bda7c1f9..4544b115f5eb 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -37,10 +37,10 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
{
- int n, nodes;
+ int n, nodes = 0;
/* Calculate the number of nodes in the supplied affinity mask */
- for (n = 0, nodes = 0; n < num_online_nodes(); n++) {
+ for_each_online_node(n) {
if (cpumask_intersects(mask, cpumask_of_node(n))) {
node_set(n, *nodemsk);
nodes++;
@@ -82,7 +82,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk);
/*
- * If the number of nodes in the mask is less than or equal the
+ * If the number of nodes in the mask is greater than or equal the
* number of vectors we just spread the vectors across the nodes.
*/
if (affv <= nodes) {
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93ad6c1fb9b6..a9b8cf500591 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -182,6 +182,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+void static_key_deferred_flush(struct static_key_deferred *key)
+{
+ STATIC_KEY_CHECK_USE();
+ flush_delayed_work(&key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3cbb0c879705..85e5546cd791 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,11 +1,16 @@
#define pr_fmt(fmt) "kcov: " fmt
#define DISABLE_BRANCH_PROFILING
+#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -14,6 +19,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <asm/setup.h>
/*
* kcov descriptor (one per opened debugfs file).
@@ -68,6 +74,11 @@ void notrace __sanitizer_cov_trace_pc(void)
if (mode == KCOV_MODE_TRACE) {
unsigned long *area;
unsigned long pos;
+ unsigned long ip = _RET_IP_;
+
+#ifdef CONFIG_RANDOMIZE_BASE
+ ip -= kaslr_offset();
+#endif
/*
* There is some code that runs in interrupts but for which
@@ -81,7 +92,7 @@ void notrace __sanitizer_cov_trace_pc(void)
/* The first word is number of subsequent PCs. */
pos = READ_ONCE(area[0]) + 1;
if (likely(pos < t->kcov_size)) {
- area[pos] = _RET_IP_;
+ area[pos] = ip;
WRITE_ONCE(area[0], pos);
}
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 561675589511..5617cc412444 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
while (hole_end <= crashk_res.end) {
unsigned long i;
+ cond_resched();
+
if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break;
/* See if I overlap any of the segments */
@@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void)
#endif
VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_X86
- VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
-#endif
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
#endif
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 037c321c5618..b56a558e406d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/fs.h>
+#include <linux/ima.h>
#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/syscalls.h>
@@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
return ret;
image->kernel_buf_len = size;
+ /* IMA needs to pass the measurement list to the next kernel. */
+ ima_add_kexec_buffer(image);
+
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
image->kernel_buf_len);
@@ -428,25 +432,65 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
return locate_mem_hole_bottom_up(start, end, kbuf);
}
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
+/**
+ * arch_kexec_walk_mem - call func(data) on free memory regions
+ * @kbuf: Context info for the search. Also passed to @func.
+ * @func: Function to call for each memory region.
+ *
+ * Return: The memory walk will stop when func returns a non-zero value
+ * and that value will be returned. If all free regions are visited without
+ * func returning non-zero, then zero will be returned.
+ */
+int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
+ int (*func)(u64, u64, void *))
+{
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return walk_iomem_res_desc(crashk_res.desc,
+ IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end,
+ kbuf, func);
+ else
+ return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
+}
+
+/**
+ * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
+ * @kbuf: Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ int ret;
+
+ ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback);
+
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
+
+/**
+ * kexec_add_buffer - place a buffer in a kexec segment
+ * @kbuf: Buffer contents and memory parameters.
+ *
+ * This function assumes that kexec_mutex is held.
+ * On successful return, @kbuf->mem will have the physical address of
+ * the buffer in memory.
+ *
+ * Return: 0 on success, negative errno on error.
*/
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
- unsigned long memsz, unsigned long buf_align,
- unsigned long buf_min, unsigned long buf_max,
- bool top_down, unsigned long *load_addr)
+int kexec_add_buffer(struct kexec_buf *kbuf)
{
struct kexec_segment *ksegment;
- struct kexec_buf buf, *kbuf;
int ret;
/* Currently adding segment this way is allowed only in file mode */
- if (!image->file_mode)
+ if (!kbuf->image->file_mode)
return -EINVAL;
- if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX)
return -EINVAL;
/*
@@ -456,45 +500,27 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
* logic goes through list of segments to make sure there are
* no destination overlaps.
*/
- if (!list_empty(&image->control_pages)) {
+ if (!list_empty(&kbuf->image->control_pages)) {
WARN_ON(1);
return -EINVAL;
}
- memset(&buf, 0, sizeof(struct kexec_buf));
- kbuf = &buf;
- kbuf->image = image;
- kbuf->buffer = buffer;
- kbuf->bufsz = bufsz;
-
- kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
- kbuf->buf_align = max(buf_align, PAGE_SIZE);
- kbuf->buf_min = buf_min;
- kbuf->buf_max = buf_max;
- kbuf->top_down = top_down;
+ /* Ensure minimum alignment needed for segments. */
+ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
+ kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
/* Walk the RAM ranges and allocate a suitable range for the buffer */
- if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res_desc(crashk_res.desc,
- IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
- else
- ret = walk_system_ram_res(0, -1, kbuf,
- locate_mem_hole_callback);
- if (ret != 1) {
- /* A suitable memory range could not be found for buffer */
- return -EADDRNOTAVAIL;
- }
+ ret = kexec_locate_mem_hole(kbuf);
+ if (ret)
+ return ret;
/* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments];
+ ksegment = &kbuf->image->segment[kbuf->image->nr_segments];
ksegment->kbuf = kbuf->buffer;
ksegment->bufsz = kbuf->bufsz;
ksegment->mem = kbuf->mem;
ksegment->memsz = kbuf->memsz;
- image->nr_segments++;
- *load_addr = ksegment->mem;
+ kbuf->image->nr_segments++;
return 0;
}
@@ -616,13 +642,15 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
unsigned long max, int top_down)
{
struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
- unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned long align, bss_align, bss_sz, bss_pad;
+ unsigned long entry, load_addr, curr_load_addr, bss_addr, offset;
unsigned char *buf_addr, *src;
int i, ret = 0, entry_sidx = -1;
const Elf_Shdr *sechdrs_c;
Elf_Shdr *sechdrs = NULL;
- void *purgatory_buf = NULL;
+ struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1,
+ .buf_min = min, .buf_max = max,
+ .top_down = top_down };
/*
* sechdrs_c points to section headers in purgatory and are read
@@ -688,9 +716,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
}
/* Determine how much memory is needed to load relocatable object. */
- buf_align = 1;
bss_align = 1;
- buf_sz = 0;
bss_sz = 0;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
@@ -699,10 +725,10 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
align = sechdrs[i].sh_addralign;
if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (buf_align < align)
- buf_align = align;
- buf_sz = ALIGN(buf_sz, align);
- buf_sz += sechdrs[i].sh_size;
+ if (kbuf.buf_align < align)
+ kbuf.buf_align = align;
+ kbuf.bufsz = ALIGN(kbuf.bufsz, align);
+ kbuf.bufsz += sechdrs[i].sh_size;
} else {
/* bss section */
if (bss_align < align)
@@ -714,32 +740,31 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
/* Determine the bss padding required to align bss properly */
bss_pad = 0;
- if (buf_sz & (bss_align - 1))
- bss_pad = bss_align - (buf_sz & (bss_align - 1));
+ if (kbuf.bufsz & (bss_align - 1))
+ bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1));
- memsz = buf_sz + bss_pad + bss_sz;
+ kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz;
/* Allocate buffer for purgatory */
- purgatory_buf = vzalloc(buf_sz);
- if (!purgatory_buf) {
+ kbuf.buffer = vzalloc(kbuf.bufsz);
+ if (!kbuf.buffer) {
ret = -ENOMEM;
goto out;
}
- if (buf_align < bss_align)
- buf_align = bss_align;
+ if (kbuf.buf_align < bss_align)
+ kbuf.buf_align = bss_align;
/* Add buffer to segment list */
- ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
- buf_align, min, max, top_down,
- &pi->purgatory_load_addr);
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out;
+ pi->purgatory_load_addr = kbuf.mem;
/* Load SHF_ALLOC sections */
- buf_addr = purgatory_buf;
+ buf_addr = kbuf.buffer;
load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + buf_sz + bss_pad;
+ bss_addr = load_addr + kbuf.bufsz + bss_pad;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -785,11 +810,11 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
* Used later to identify which section is purgatory and skip it
* from checksumming.
*/
- pi->purgatory_buf = purgatory_buf;
+ pi->purgatory_buf = kbuf.buffer;
return ret;
out:
vfree(sechdrs);
- vfree(purgatory_buf);
+ vfree(kbuf.buffer);
return ret;
}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 0a52315d9c62..4cef7e4706b0 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -20,22 +20,6 @@ struct kexec_sha_region {
unsigned long len;
};
-/*
- * Keeps track of buffer parameters as provided by caller for requesting
- * memory placement of buffer.
- */
-struct kexec_buf {
- struct kimage *image;
- char *buffer;
- unsigned long bufsz;
- unsigned long mem;
- unsigned long memsz;
- unsigned long buf_align;
- unsigned long buf_min;
- unsigned long buf_max;
- bool top_down; /* allocate from top of memory hole */
-};
-
void kimage_file_post_load_cleanup(struct kimage *image);
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0277d1216f80..d45c96073afb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -39,7 +39,7 @@
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/module.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d63095472ea9..43460104f119 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -52,7 +52,7 @@
#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7bd265f6b098..7c38f8f3d97b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3191,7 +3191,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
return 0;
}
-static int __lock_is_held(struct lockdep_map *lock);
+static int __lock_is_held(struct lockdep_map *lock, int read);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3332,7 +3332,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
chain_key = iterate_chain_key(chain_key, class_idx);
- if (nest_lock && !__lock_is_held(nest_lock))
+ if (nest_lock && !__lock_is_held(nest_lock, -1))
return print_lock_nested_lock_not_held(curr, hlock, ip);
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
@@ -3579,7 +3579,7 @@ found_it:
return 1;
}
-static int __lock_is_held(struct lockdep_map *lock)
+static int __lock_is_held(struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -3587,8 +3587,12 @@ static int __lock_is_held(struct lockdep_map *lock)
for (i = 0; i < curr->lockdep_depth; i++) {
struct held_lock *hlock = curr->held_locks + i;
- if (match_held_lock(hlock, lock))
- return 1;
+ if (match_held_lock(hlock, lock)) {
+ if (read == -1 || hlock->read == read)
+ return 1;
+
+ return 0;
+ }
}
return 0;
@@ -3772,7 +3776,7 @@ void lock_release(struct lockdep_map *lock, int nested,
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held(struct lockdep_map *lock)
+int lock_is_held_type(struct lockdep_map *lock, int read)
{
unsigned long flags;
int ret = 0;
@@ -3784,13 +3788,13 @@ int lock_is_held(struct lockdep_map *lock)
check_flags(flags);
current->lockdep_recursion = 1;
- ret = __lock_is_held(lock);
+ ret = __lock_is_held(lock, read);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(lock_is_held);
+EXPORT_SYMBOL_GPL(lock_is_held_type);
struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index a0f61effad25..6d1fcc786081 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -18,7 +18,7 @@
#include <linux/debug_locks.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include "lockdep_internals.h"
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index eb0a599fcf58..e852be4851fc 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -108,11 +108,7 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
/*
* Get the counter ID stored in file->f_inode->i_private
*/
- if (!file->f_inode) {
- WARN_ON_ONCE(1);
- return -EBADF;
- }
- counter = (long)(file->f_inode->i_private);
+ counter = (long)file_inode(file)->i_private;
if (counter >= qstat_num)
return -EBADF;
@@ -177,11 +173,7 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf,
/*
* Get the counter ID stored in file->f_inode->i_private
*/
- if (!file->f_inode) {
- WARN_ON_ONCE(1);
- return -EBADF;
- }
- if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+ if ((long)file_inode(file)->i_private != qstat_reset_cnts)
return count;
for_each_possible_cpu(cpu) {
diff --git a/kernel/memremap.c b/kernel/memremap.c
index b501e390bb34..9ecedc28b928 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -246,7 +246,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
+ mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
@@ -358,7 +360,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (error)
goto err_pfn_remap;
+ mem_hotplug_begin();
error = arch_add_memory(nid, align_start, align_size, true);
+ mem_hotplug_done();
if (error)
goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index 0e54d5bf0097..38d4270925d4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -46,7 +46,7 @@
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
@@ -313,8 +313,11 @@ struct load_info {
} index;
};
-/* We require a truly strong try_module_get(): 0 means failure due to
- ongoing or failed initialization etc. */
+/*
+ * We require a truly strong try_module_get(): 0 means success.
+ * Otherwise an error is returned due to ongoing or failed
+ * initialization etc.
+ */
static inline int strong_try_module_get(struct module *mod)
{
BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
@@ -330,7 +333,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
enum lockdep_ok lockdep_ok)
{
add_taint(flag, lockdep_ok);
- mod->taints |= (1U << flag);
+ set_bit(flag, &mod->taints);
}
/*
@@ -1138,24 +1141,13 @@ static inline int module_unload_init(struct module *mod)
static size_t module_flags_taint(struct module *mod, char *buf)
{
size_t l = 0;
+ int i;
+
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ if (taint_flags[i].module && test_bit(i, &mod->taints))
+ buf[l++] = taint_flags[i].c_true;
+ }
- if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
- buf[l++] = 'P';
- if (mod->taints & (1 << TAINT_OOT_MODULE))
- buf[l++] = 'O';
- if (mod->taints & (1 << TAINT_FORCED_MODULE))
- buf[l++] = 'F';
- if (mod->taints & (1 << TAINT_CRAP))
- buf[l++] = 'C';
- if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
- buf[l++] = 'E';
- if (mod->taints & (1 << TAINT_LIVEPATCH))
- buf[l++] = 'K';
- /*
- * TAINT_FORCED_RMMOD: could be added.
- * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
- * apply to modules.
- */
return l;
}
@@ -1911,6 +1903,9 @@ static void frob_writable_data(const struct module_layout *layout,
/* livepatching wants to disable read-only so it can frob module. */
void module_disable_ro(const struct module *mod)
{
+ if (!rodata_enabled)
+ return;
+
frob_text(&mod->core_layout, set_memory_rw);
frob_rodata(&mod->core_layout, set_memory_rw);
frob_ro_after_init(&mod->core_layout, set_memory_rw);
@@ -1920,6 +1915,9 @@ void module_disable_ro(const struct module *mod)
void module_enable_ro(const struct module *mod, bool after_init)
{
+ if (!rodata_enabled)
+ return;
+
frob_text(&mod->core_layout, set_memory_ro);
frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
@@ -1952,6 +1950,9 @@ void set_all_modules_text_rw(void)
{
struct module *mod;
+ if (!rodata_enabled)
+ return;
+
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
@@ -1968,9 +1969,18 @@ void set_all_modules_text_ro(void)
{
struct module *mod;
+ if (!rodata_enabled)
+ return;
+
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
+ /*
+ * Ignore going modules since it's possible that ro
+ * protection has already been disabled, otherwise we'll
+ * run into protection faults at module deallocation.
+ */
+ if (mod->state == MODULE_STATE_UNFORMED ||
+ mod->state == MODULE_STATE_GOING)
continue;
frob_text(&mod->core_layout, set_memory_ro);
@@ -1981,10 +1991,12 @@ void set_all_modules_text_ro(void)
static void disable_ro_nx(const struct module_layout *layout)
{
- frob_text(layout, set_memory_rw);
- frob_rodata(layout, set_memory_rw);
+ if (rodata_enabled) {
+ frob_text(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_rw);
+ frob_ro_after_init(layout, set_memory_rw);
+ }
frob_rodata(layout, set_memory_x);
- frob_ro_after_init(layout, set_memory_rw);
frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
@@ -3709,6 +3721,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
sysfs_cleanup:
mod_sysfs_teardown(mod);
coming_cleanup:
+ mod->state = MODULE_STATE_GOING;
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
@@ -4042,6 +4055,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
}
#endif /* CONFIG_KALLSYMS */
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
@@ -4086,7 +4103,7 @@ static void m_stop(struct seq_file *m, void *p)
static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
- char buf[8];
+ char buf[MODULE_FLAGS_BUF_SIZE];
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4257,7 +4274,7 @@ EXPORT_SYMBOL_GPL(__module_text_address);
void print_modules(void)
{
struct module *mod;
- char buf[8];
+ char buf[MODULE_FLAGS_BUF_SIZE];
printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
diff --git a/kernel/padata.c b/kernel/padata.c
index 7848f0566403..05316c9f32da 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -64,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd)
static void padata_parallel_worker(struct work_struct *parallel_work)
{
struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
- struct padata_instance *pinst;
LIST_HEAD(local_list);
local_bh_disable();
pqueue = container_of(parallel_work,
struct padata_parallel_queue, work);
- pd = pqueue->pd;
- pinst = pd->pinst;
spin_lock(&pqueue->parallel.lock);
list_replace_init(&pqueue->parallel.list, &local_list);
diff --git a/kernel/panic.c b/kernel/panic.c
index e6480e20379e..901c4fb46002 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -298,30 +298,27 @@ void panic(const char *fmt, ...)
EXPORT_SYMBOL(panic);
-
-struct tnt {
- u8 bit;
- char true;
- char false;
-};
-
-static const struct tnt tnts[] = {
- { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
- { TAINT_FORCED_MODULE, 'F', ' ' },
- { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
- { TAINT_FORCED_RMMOD, 'R', ' ' },
- { TAINT_MACHINE_CHECK, 'M', ' ' },
- { TAINT_BAD_PAGE, 'B', ' ' },
- { TAINT_USER, 'U', ' ' },
- { TAINT_DIE, 'D', ' ' },
- { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
- { TAINT_WARN, 'W', ' ' },
- { TAINT_CRAP, 'C', ' ' },
- { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
- { TAINT_OOT_MODULE, 'O', ' ' },
- { TAINT_UNSIGNED_MODULE, 'E', ' ' },
- { TAINT_SOFTLOCKUP, 'L', ' ' },
- { TAINT_LIVEPATCH, 'K', ' ' },
+/*
+ * TAINT_FORCED_RMMOD could be a per-module flag but the module
+ * is being removed anyway.
+ */
+const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
+ { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */
+ { 'F', ' ', true }, /* TAINT_FORCED_MODULE */
+ { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */
+ { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */
+ { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */
+ { 'B', ' ', false }, /* TAINT_BAD_PAGE */
+ { 'U', ' ', false }, /* TAINT_USER */
+ { 'D', ' ', false }, /* TAINT_DIE */
+ { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */
+ { 'W', ' ', false }, /* TAINT_WARN */
+ { 'C', ' ', true }, /* TAINT_CRAP */
+ { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */
+ { 'O', ' ', true }, /* TAINT_OOT_MODULE */
+ { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
+ { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
+ { 'K', ' ', true }, /* TAINT_LIVEPATCH */
};
/**
@@ -348,17 +345,17 @@ static const struct tnt tnts[] = {
*/
const char *print_tainted(void)
{
- static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
+ static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
if (tainted_mask) {
char *s;
int i;
s = buf + sprintf(buf, "Tainted: ");
- for (i = 0; i < ARRAY_SIZE(tnts); i++) {
- const struct tnt *t = &tnts[i];
- *s++ = test_bit(t->bit, &tainted_mask) ?
- t->true : t->false;
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ const struct taint_flag *t = &taint_flags[i];
+ *s++ = test_bit(i, &tainted_mask) ?
+ t->c_true : t->c_false;
}
*s = 0;
} else
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index df9e8e9e0be7..eef2ce968636 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -151,8 +151,12 @@ out:
static void delayed_free_pidns(struct rcu_head *p)
{
- kmem_cache_free(pid_ns_cachep,
- container_of(p, struct pid_namespace, rcu));
+ struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
+
+ dec_pid_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+
+ kmem_cache_free(pid_ns_cachep, ns);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
@@ -162,8 +166,6 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
- dec_pid_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
call_rcu(&ns->rcu, delayed_free_pidns);
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4f0f0604f1c4..2d8e2b227db8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,7 +30,7 @@
#include <linux/compiler.h>
#include <linux/ktime.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 35310b627388..22df9f7ff672 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -25,7 +25,7 @@
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "power.h"
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 577f2288d19f..8b2696420abb 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -46,7 +46,7 @@
#include <linux/ctype.h>
#include <linux/uio.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/sections.h>
#define CREATE_TRACE_POINTS
@@ -356,7 +356,6 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
-static enum log_flags syslog_prev;
static size_t syslog_partial;
/* index and sequence number of the first record stored in the buffer */
@@ -370,7 +369,6 @@ static u32 log_next_idx;
/* the next printk record to write to the console */
static u64 console_seq;
static u32 console_idx;
-static enum log_flags console_prev;
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
@@ -639,27 +637,15 @@ static void append_char(char **pp, char *e, char c)
}
static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq,
- enum log_flags prev_flags)
+ struct printk_log *msg, u64 seq)
{
u64 ts_usec = msg->ts_nsec;
- char cont = '-';
do_div(ts_usec, 1000);
- /*
- * If we couldn't merge continuation line fragments during the print,
- * export the stored flags to allow an optional external merge of the
- * records. Merging the records isn't always neccessarily correct, like
- * when we hit a race during printing. In most cases though, it produces
- * better readable output. 'c' in the record flags mark the first
- * fragment of a line, '+' the following.
- */
- if (msg->flags & LOG_CONT)
- cont = (prev_flags & LOG_CONT) ? '+' : 'c';
-
return scnprintf(buf, size, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level, seq, ts_usec, cont);
+ (msg->facility << 3) | msg->level, seq, ts_usec,
+ msg->flags & LOG_CONT ? 'c' : '-');
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
@@ -714,7 +700,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
struct devkmsg_user {
u64 seq;
u32 idx;
- enum log_flags prev;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
@@ -748,7 +733,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
return -ENOMEM;
buf[len] = '\0';
- if (copy_from_iter(buf, len, from) != len) {
+ if (!copy_from_iter_full(buf, len, from)) {
kfree(buf);
return -EFAULT;
}
@@ -824,12 +809,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
msg = log_from_idx(user->idx);
len = msg_print_ext_header(user->buf, sizeof(user->buf),
- msg, user->seq, user->prev);
+ msg, user->seq);
len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
log_dict(msg), msg->dict_len,
log_text(msg), msg->text_len);
- user->prev = msg->flags;
user->idx = log_next(user->idx);
user->seq++;
raw_spin_unlock_irq(&logbuf_lock);
@@ -1210,26 +1194,12 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
- bool syslog, char *buf, size_t size)
+static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size)
{
const char *text = log_text(msg);
size_t text_size = msg->text_len;
- bool prefix = true;
- bool newline = true;
size_t len = 0;
- if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
- prefix = false;
-
- if (msg->flags & LOG_CONT) {
- if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
- prefix = false;
-
- if (!(msg->flags & LOG_NEWLINE))
- newline = false;
- }
-
do {
const char *next = memchr(text, '\n', text_size);
size_t text_len;
@@ -1247,22 +1217,17 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
text_len + 1 >= size - len)
break;
- if (prefix)
- len += print_prefix(msg, syslog, buf + len);
+ len += print_prefix(msg, syslog, buf + len);
memcpy(buf + len, text, text_len);
len += text_len;
- if (next || newline)
- buf[len++] = '\n';
+ buf[len++] = '\n';
} else {
/* SYSLOG_ACTION_* buffer size only calculation */
- if (prefix)
- len += print_prefix(msg, syslog, NULL);
+ len += print_prefix(msg, syslog, NULL);
len += text_len;
- if (next || newline)
- len++;
+ len++;
}
- prefix = true;
text = next;
} while (text);
@@ -1288,7 +1253,6 @@ static int syslog_print(char __user *buf, int size)
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
syslog_idx = log_first_idx;
- syslog_prev = 0;
syslog_partial = 0;
}
if (syslog_seq == log_next_seq) {
@@ -1298,13 +1262,11 @@ static int syslog_print(char __user *buf, int size)
skip = syslog_partial;
msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, syslog_prev, true, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
syslog_idx = log_next(syslog_idx);
syslog_seq++;
- syslog_prev = msg->flags;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
@@ -1347,7 +1309,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 next_seq;
u64 seq;
u32 idx;
- enum log_flags prev;
/*
* Find first record that fits, including all following records,
@@ -1355,12 +1316,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
*/
seq = clear_seq;
idx = clear_idx;
- prev = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- len += msg_print_text(msg, prev, true, NULL, 0);
- prev = msg->flags;
+ len += msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -1368,12 +1327,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
/* move first record forward until length fits into the buffer */
seq = clear_seq;
idx = clear_idx;
- prev = 0;
while (len > size && seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- len -= msg_print_text(msg, prev, true, NULL, 0);
- prev = msg->flags;
+ len -= msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -1386,7 +1343,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
struct printk_log *msg = log_from_idx(idx);
int textlen;
- textlen = msg_print_text(msg, prev, true, text,
+ textlen = msg_print_text(msg, true, text,
LOG_LINE_MAX + PREFIX_MAX);
if (textlen < 0) {
len = textlen;
@@ -1394,7 +1351,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
}
idx = log_next(idx);
seq++;
- prev = msg->flags;
raw_spin_unlock_irq(&logbuf_lock);
if (copy_to_user(buf + len, text, textlen))
@@ -1407,7 +1363,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
/* messages are gone, move to next one */
seq = log_first_seq;
idx = log_first_idx;
- prev = 0;
}
}
}
@@ -1508,7 +1463,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
syslog_idx = log_first_idx;
- syslog_prev = 0;
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
@@ -1521,16 +1475,14 @@ int do_syslog(int type, char __user *buf, int len, int source)
} else {
u64 seq = syslog_seq;
u32 idx = syslog_idx;
- enum log_flags prev = syslog_prev;
error = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- error += msg_print_text(msg, prev, true, NULL, 0);
+ error += msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
error -= syslog_partial;
}
@@ -1631,46 +1583,25 @@ static inline void printk_delay(void)
static struct cont {
char buf[LOG_LINE_MAX];
size_t len; /* length == 0 means unused buffer */
- size_t cons; /* bytes written to console */
struct task_struct *owner; /* task of first print*/
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
- bool flushed:1; /* buffer sealed and committed */
} cont;
static void cont_flush(void)
{
- if (cont.flushed)
- return;
if (cont.len == 0)
return;
- if (cont.cons) {
- /*
- * If a fragment of this line was directly flushed to the
- * console; wait for the console to pick up the rest of the
- * line. LOG_NOCONS suppresses a duplicated output.
- */
- log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.flushed = true;
- } else {
- /*
- * If no fragment of this line ever reached the console,
- * just submit it to the store and free the buffer.
- */
- log_store(cont.facility, cont.level, cont.flags, 0,
- NULL, 0, cont.buf, cont.len);
- cont.len = 0;
- }
+
+ log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec,
+ NULL, 0, cont.buf, cont.len);
+ cont.len = 0;
}
static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len)
{
- if (cont.len && cont.flushed)
- return false;
-
/*
* If ext consoles are present, flush and skip in-kernel
* continuation. See nr_ext_console_drivers definition. Also, if
@@ -1687,8 +1618,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
cont.owner = current;
cont.ts_nsec = local_clock();
cont.flags = flags;
- cont.cons = 0;
- cont.flushed = false;
}
memcpy(cont.buf + cont.len, text, len);
@@ -1707,34 +1636,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
return true;
}
-static size_t cont_print_text(char *text, size_t size)
-{
- size_t textlen = 0;
- size_t len;
-
- if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
- textlen += print_time(cont.ts_nsec, text);
- size -= textlen;
- }
-
- len = cont.len - cont.cons;
- if (len > 0) {
- if (len+1 > size)
- len = size-1;
- memcpy(text + textlen, cont.buf + cont.cons, len);
- textlen += len;
- cont.cons = cont.len;
- }
-
- if (cont.flushed) {
- if (cont.flags & LOG_NEWLINE)
- text[textlen++] = '\n';
- /* got everything, release buffer */
- cont.len = 0;
- }
- return textlen;
-}
-
static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
{
/*
@@ -1926,7 +1827,8 @@ int vprintk_default(const char *fmt, va_list args)
int r;
#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
+ /* Allow to pass printk() to kdb but avoid a recursion. */
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
return r;
}
@@ -1980,33 +1882,24 @@ static u64 syslog_seq;
static u32 syslog_idx;
static u64 console_seq;
static u32 console_idx;
-static enum log_flags syslog_prev;
static u64 log_first_seq;
static u32 log_first_idx;
static u64 log_next_seq;
-static enum log_flags console_prev;
-static struct cont {
- size_t len;
- size_t cons;
- u8 level;
- bool flushed:1;
-} cont;
static char *log_text(const struct printk_log *msg) { return NULL; }
static char *log_dict(const struct printk_log *msg) { return NULL; }
static struct printk_log *log_from_idx(u32 idx) { return NULL; }
static u32 log_next(u32 idx) { return 0; }
static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq,
- enum log_flags prev_flags) { return 0; }
+ struct printk_log *msg,
+ u64 seq) { return 0; }
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
static void call_console_drivers(int level,
const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
-static size_t cont_print_text(char *text, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
/* Still needs to be defined for users */
@@ -2270,42 +2163,6 @@ static inline int can_use_console(void)
return cpu_online(raw_smp_processor_id()) || have_callable_console();
}
-static void console_cont_flush(char *text, size_t size)
-{
- unsigned long flags;
- size_t len;
-
- raw_spin_lock_irqsave(&logbuf_lock, flags);
-
- if (!cont.len)
- goto out;
-
- if (suppress_message_printing(cont.level)) {
- cont.cons = cont.len;
- if (cont.flushed)
- cont.len = 0;
- goto out;
- }
-
- /*
- * We still queue earlier records, likely because the console was
- * busy. The earlier ones need to be printed before this one, we
- * did not flush any fragment so far, so just let it queue up.
- */
- if (console_seq < log_next_seq && !cont.cons)
- goto out;
-
- len = cont_print_text(text, size);
- raw_spin_unlock(&logbuf_lock);
- stop_critical_timings();
- call_console_drivers(cont.level, NULL, 0, text, len);
- start_critical_timings();
- local_irq_restore(flags);
- return;
-out:
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-}
-
/**
* console_unlock - unlock the console system
*
@@ -2359,9 +2216,6 @@ again:
return;
}
- /* flush buffered message fragment immediately to console */
- console_cont_flush(text, sizeof(text));
-
for (;;) {
struct printk_log *msg;
size_t ext_len = 0;
@@ -2381,7 +2235,6 @@ again:
/* messages are gone, move to first one */
console_seq = log_first_seq;
console_idx = log_first_idx;
- console_prev = 0;
} else {
len = 0;
}
@@ -2391,8 +2244,7 @@ skip:
msg = log_from_idx(console_idx);
level = msg->level;
- if ((msg->flags & LOG_NOCONS) ||
- suppress_message_printing(level)) {
+ if (suppress_message_printing(level)) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it, and
@@ -2400,22 +2252,14 @@ skip:
*/
console_idx = log_next(console_idx);
console_seq++;
- /*
- * We will get here again when we register a new
- * CON_PRINTBUFFER console. Clear the flag so we
- * will properly dump everything later.
- */
- msg->flags &= ~LOG_NOCONS;
- console_prev = msg->flags;
goto skip;
}
- len += msg_print_text(msg, console_prev, false,
- text + len, sizeof(text) - len);
+ len += msg_print_text(msg, false, text + len, sizeof(text) - len);
if (nr_ext_console_drivers) {
ext_len = msg_print_ext_header(ext_text,
sizeof(ext_text),
- msg, console_seq, console_prev);
+ msg, console_seq);
ext_len += msg_print_ext_body(ext_text + ext_len,
sizeof(ext_text) - ext_len,
log_dict(msg), msg->dict_len,
@@ -2423,7 +2267,6 @@ skip:
}
console_idx = log_next(console_idx);
console_seq++;
- console_prev = msg->flags;
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
@@ -2718,7 +2561,6 @@ void register_console(struct console *newcon)
raw_spin_lock_irqsave(&logbuf_lock, flags);
console_seq = syslog_seq;
console_idx = syslog_idx;
- console_prev = syslog_prev;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
@@ -3074,7 +2916,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
goto out;
msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, 0, syslog, line, size);
+ l = msg_print_text(msg, syslog, line, size);
dumper->cur_idx = log_next(dumper->cur_idx);
dumper->cur_seq++;
@@ -3143,7 +2985,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
u32 idx;
u64 next_seq;
u32 next_idx;
- enum log_flags prev;
size_t l = 0;
bool ret = false;
@@ -3166,27 +3007,23 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* calculate length of entire buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
- prev = 0;
while (seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l += msg_print_text(msg, prev, true, NULL, 0);
+ l += msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
- prev = 0;
while (l > size && seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l -= msg_print_text(msg, prev, true, NULL, 0);
+ l -= msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
/* last message in next interation */
@@ -3197,10 +3034,9 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
while (seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l += msg_print_text(msg, prev, syslog, buf + l, size - l);
+ l += msg_print_text(msg, syslog, buf + l, size - l);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
dumper->next_seq = next_seq;
diff --git a/kernel/profile.c b/kernel/profile.c
index 2dbccf2d806c..f67ce0aa6bc4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -408,7 +408,7 @@ void profile_tick(int type)
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e6474f7272ec..49ba7c1ade9d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -27,6 +27,35 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
+/*
+ * Access another process' address space via ptrace.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ if (!tsk->ptrace ||
+ (current != tsk->parent) ||
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptracer_capable(tsk, mm->user_ns))) {
+ mmput(mm);
+ return 0;
+ }
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ mmput(mm);
+
+ return ret;
+}
+
/*
* ptrace a task: make the debugger its new parent and
@@ -39,6 +68,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
BUG_ON(!list_empty(&child->ptrace_entry));
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
+ rcu_read_lock();
+ child->ptracer_cred = get_cred(__task_cred(new_parent));
+ rcu_read_unlock();
}
/**
@@ -71,12 +103,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
*/
void __ptrace_unlink(struct task_struct *child)
{
+ const struct cred *old_cred;
BUG_ON(!child->ptrace);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
+ old_cred = child->ptracer_cred;
+ child->ptracer_cred = NULL;
+ put_cred(old_cred);
spin_lock(&child->sighand->siglock);
child->ptrace = 0;
@@ -220,7 +256,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
- int dumpable = 0;
+ struct mm_struct *mm;
kuid_t caller_uid;
kgid_t caller_gid;
@@ -271,16 +307,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return -EPERM;
ok:
rcu_read_unlock();
- smp_rmb();
- if (task->mm)
- dumpable = get_dumpable(task->mm);
- rcu_read_lock();
- if (dumpable != SUID_DUMP_USER &&
- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
+ mm = task->mm;
+ if (mm &&
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptrace_has_cap(mm->user_ns, mode)))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
@@ -344,10 +375,6 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize)
flags |= PT_SEIZED;
- rcu_read_lock();
- if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
- flags |= PT_PTRACE_CAP;
- rcu_read_unlock();
task->ptrace = flags;
__ptrace_link(task, current);
@@ -537,7 +564,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
- retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
+ retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE);
+
if (!retval) {
if (copied)
break;
@@ -564,7 +592,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
- retval = access_process_vm(tsk, dst, buf, this_len,
+ retval = ptrace_access_vm(tsk, dst, buf, this_len,
FOLL_FORCE | FOLL_WRITE);
if (!retval) {
if (copied)
@@ -1128,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
unsigned long tmp;
int copied;
- copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
+ copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
return -EIO;
return put_user(tmp, (unsigned long __user *)data);
@@ -1139,7 +1167,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
{
int copied;
- copied = access_process_vm(tsk, addr, &data, sizeof(data),
+ copied = ptrace_access_vm(tsk, addr, &data, sizeof(data),
FOLL_FORCE | FOLL_WRITE);
return (copied == sizeof(data)) ? 0 : -EIO;
}
@@ -1157,7 +1185,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
- ret = access_process_vm(child, addr, &word, sizeof(word),
+ ret = ptrace_access_vm(child, addr, &word, sizeof(word),
FOLL_FORCE);
if (ret != sizeof(word))
ret = -EIO;
@@ -1167,7 +1195,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
- ret = access_process_vm(child, addr, &data, sizeof(data),
+ ret = ptrace_access_vm(child, addr, &data, sizeof(data),
FOLL_FORCE | FOLL_WRITE);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7d4c3d..0d6ff3e471be 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
#define TPS(x) tracepoint_string(x)
void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
/*
* This function really isn't for public consumption, but RCU is special in
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1898559e6b60..b23a4d076f3d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -185,9 +185,6 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
* benefits of doing might_sleep() to reduce latency.)
*
* Cool, huh? (Due to Josh Triplett.)
- *
- * But we want to make this a static inline later. The cond_resched()
- * currently makes this problematic.
*/
void synchronize_sched(void)
{
@@ -195,7 +192,6 @@ void synchronize_sched(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched() in RCU read-side critical section");
- cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 196f0302e2f4..c64b827ecbca 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/*
* During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously.
+ * invoked, we start taking RCU lockdep issues seriously. Note that unlike
+ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
+ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
+ * The reason for this is that Tiny RCU does not need kthreads, so does
+ * not have to care about the fact that the scheduler is half-initialized
+ * at a certain phase of the boot process.
*/
void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 96c52e43f7ca..cb4e2056ccf3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
int sysctl_panic_on_rcu_stall __read_mostly;
/*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned. So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
+ * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
* optimize synchronize_rcu() to a simple barrier(). When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods. This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods. This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking. Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
*/
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
@@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void)
early_initcall(rcu_spawn_gp_kthread);
/*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections. This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process. Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops). After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * runtime RCU functionality.
*/
void rcu_scheduler_starting(void)
{
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_INIT;
+ rcu_test_sync_prims();
}
/*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d3053e99fdb6..e59e1849b89a 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -532,18 +532,28 @@ struct rcu_exp_work {
};
/*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and mid-boot-time tasks.
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+ smp_call_func_t func, unsigned long s)
+{
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus(rsp, func);
+
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rsp, s);
+}
+
+/*
* Work-queue handler to drive an expedited grace period forward.
*/
static void wait_rcu_exp_gp(struct work_struct *wp)
{
struct rcu_exp_work *rewp;
- /* Initialize the rcu_node tree in preparation for the wait. */
rewp = container_of(wp, struct rcu_exp_work, rew_work);
- sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
-
- /* Wait and clean up, including waking everyone. */
- rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+ rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
}
/*
@@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
- /* Marshall arguments and schedule the expedited grace period. */
- rew.rew_func = func;
- rew.rew_rsp = rsp;
- rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- schedule_work(&rew.rew_work);
+ /* Ensure that load happens before action based on it. */
+ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(rsp, func, s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_func = func;
+ rew.rew_rsp = rsp;
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+ schedule_work(&rew.rew_work);
+ }
/* Wait for expedited grace period to complete. */
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
@@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void)
{
struct rcu_state *rsp = rcu_state_p;
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
_synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Switch to run-time mode once Tree RCU has fully initialized.
+ */
+static int __init rcu_exp_runtime_mode(void)
+{
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ rcu_test_sync_prims();
+ return 0;
+}
+core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 85c5a883c6e3..56583e764ebf 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -670,7 +670,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (!rcu_scheduler_active)
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f19271dce0a9..4f6db7e6a117 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
* Should expedited grace-period primitives always fall back to their
* non-expedited counterparts? Intended for use within RCU. Note
* that if the user specifies both rcu_expedited and rcu_normal, then
- * rcu_normal wins.
+ * rcu_normal wins. (Except during the time period during boot from
+ * when the first task is spawned until the rcu_exp_runtime_mode()
+ * core_initcall() is invoked, at which point everything is expedited.)
*/
bool rcu_gp_is_normal(void)
{
- return READ_ONCE(rcu_normal);
+ return READ_ONCE(rcu_normal) &&
+ rcu_scheduler_active != RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting =
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+ rcu_scheduler_active == RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
+/*
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+ if (!IS_ENABLED(CONFIG_PROVE_RCU))
+ return;
+ synchronize_rcu();
+ synchronize_rcu_bh();
+ synchronize_sched();
+ synchronize_rcu_expedited();
+ synchronize_rcu_bh_expedited();
+ synchronize_sched_expedited();
+}
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void)
early_boot_test_call_rcu_bh();
if (rcu_self_test_sched)
early_boot_test_call_rcu_sched();
+ rcu_test_sync_prims();
}
static int rcu_verify_early_boot_tests(void)
diff --git a/kernel/relay.c b/kernel/relay.c
index da79a109dbeb..8f18d314a96a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan,
{
struct rchan_buf *buf;
- if (!chan)
+ if (!chan || cpu >= NR_CPUS)
return;
buf = *per_cpu_ptr(chan->buf, cpu);
- if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs)
+ if (!buf || subbufs_consumed > chan->n_subbufs)
return;
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 966556ebdbb3..c56fb57f2991 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1456,7 +1456,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(queued)) {
- ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+ ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bff9c774987a..f7ce79a46050 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -41,8 +41,7 @@
* outside of a lifetime-guarded section. In general, this
* is only needed for handling filters shared across tasks.
* @prev: points to a previously installed, or inherited, filter
- * @len: the number of instructions in the program
- * @insnsi: the BPF program instructions to evaluate
+ * @prog: the BPF program to evaluate
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -168,8 +167,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
}
/**
- * seccomp_run_filters - evaluates all seccomp filters against @syscall
- * @syscall: number of the current system call
+ * seccomp_run_filters - evaluates all seccomp filters against @sd
+ * @sd: optional seccomp data to be passed to filters
*
* Returns valid seccomp BPF response codes.
*/
diff --git a/kernel/signal.c b/kernel/signal.c
index 29a410780aa9..3603d93a1968 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,7 +39,7 @@
#include <trace/events/signal.h>
#include <asm/param.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
@@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task)
* fresh group stop. Read comment in do_signal_stop() for details.
*/
if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
- sig->flags = SIGNAL_STOP_STOPPED;
+ signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
return true;
}
return false;
@@ -587,7 +587,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) &&
- tsk->signal->it_real_incr.tv64 != 0) {
+ tsk->signal->it_real_incr != 0) {
hrtimer_forward(tmr, tmr->base->get_time(),
tsk->signal->it_real_incr);
hrtimer_restart(tmr);
@@ -843,7 +843,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal_to_deliver().
*/
- signal->flags = why | SIGNAL_STOP_CONTINUED;
+ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
signal->group_stop_count = 0;
signal->group_exit_code = 0;
}
@@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset)
{
struct task_struct *tsk = current;
+ /*
+ * In case the signal mask hasn't changed, there is nothing we need
+ * to do. The current->blocked shouldn't be modified by other task.
+ */
+ if (sigequalsets(&tsk->blocked, newset))
+ return;
+
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, newset);
spin_unlock_irq(&tsk->sighand->siglock);
@@ -2759,7 +2766,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
const struct timespec *ts)
{
- ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };
+ ktime_t *to = NULL, timeout = KTIME_MAX;
struct task_struct *tsk = current;
sigset_t mask = *which;
int sig, ret = 0;
@@ -2779,7 +2786,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
sig = dequeue_signal(tsk, &mask, info);
- if (!sig && timeout.tv64) {
+ if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
* while we are sleeping in so that we'll be awakened when
diff --git a/kernel/sys.c b/kernel/sys.c
index 9758892a2d09..842914ef7de4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -57,7 +57,7 @@
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 635482e60ca3..8acef8576ce9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -150,6 +150,9 @@ cond_syscall(sys_io_destroy);
cond_syscall(sys_io_submit);
cond_syscall(sys_io_cancel);
cond_syscall(sys_io_getevents);
+cond_syscall(compat_sys_io_setup);
+cond_syscall(compat_sys_io_submit);
+cond_syscall(compat_sys_io_getevents);
cond_syscall(sys_sysfs);
cond_syscall(sys_syslog);
cond_syscall(sys_process_vm_readv);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 39b3368f6de6..8dbaec0e4f7f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,7 +67,7 @@
#include <linux/bpf.h>
#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#ifdef CONFIG_X86
@@ -627,7 +627,7 @@ static struct ctl_table kern_table[] = {
.data = &tracepoint_printk,
.maxlen = sizeof(tracepoint_printk),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = tracepoint_printk_sysctl,
},
#endif
#ifdef CONFIG_KEXEC_CORE
@@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void)
#ifdef CONFIG_COREDUMP
if (suid_dumpable == SUID_DUMP_ROOT &&
core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING "Unsafe core_pattern used with "\
- "suid_dumpable=2. Pipe handler or fully qualified "\
- "core dump path required.\n");
+ printk(KERN_WARNING
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+ );
}
#endif
}
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 6eb99c17dbd8..ece4b177052b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
"warning: process `%s' used the deprecated sysctl "
"system call with ", current->comm);
for (i = 0; i < nlen; i++)
- printk("%d.", name[i]);
- printk("\n");
+ printk(KERN_CONT "%d.", name[i]);
+ printk(KERN_CONT "\n");
}
return;
}
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9b08ca391aed..e6dc9a538efa 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -234,7 +234,7 @@ static int alarmtimer_suspend(struct device *dev)
min = freezer_delta;
expires = freezer_expires;
type = freezer_alarmtype;
- freezer_delta = ktime_set(0, 0);
+ freezer_delta = 0;
spin_unlock_irqrestore(&freezer_delta_lock, flags);
rtc = alarmtimer_get_rtcdev();
@@ -254,13 +254,13 @@ static int alarmtimer_suspend(struct device *dev)
if (!next)
continue;
delta = ktime_sub(next->expires, base->gettime());
- if (!min.tv64 || (delta.tv64 < min.tv64)) {
+ if (!min || (delta < min)) {
expires = next->expires;
min = delta;
type = i;
}
}
- if (min.tv64 == 0)
+ if (min == 0)
return 0;
if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
@@ -277,7 +277,7 @@ static int alarmtimer_suspend(struct device *dev)
now = ktime_add(now, min);
/* Set alarm, if in the past reject suspend briefly to handle */
- ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ ret = rtc_timer_start(rtc, &rtctimer, now, 0);
if (ret < 0)
__pm_wakeup_event(ws, MSEC_PER_SEC);
return ret;
@@ -328,7 +328,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
delta = ktime_sub(absexp, base->gettime());
spin_lock_irqsave(&freezer_delta_lock, flags);
- if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) {
+ if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
freezer_expires = absexp;
freezer_alarmtype = type;
@@ -453,10 +453,10 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
delta = ktime_sub(now, alarm->node.expires);
- if (delta.tv64 < 0)
+ if (delta < 0)
return 0;
- if (unlikely(delta.tv64 >= interval.tv64)) {
+ if (unlikely(delta >= interval)) {
s64 incr = ktime_to_ns(interval);
overrun = ktime_divns(delta, incr);
@@ -464,7 +464,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
alarm->node.expires = ktime_add_ns(alarm->node.expires,
incr*overrun);
- if (alarm->node.expires.tv64 > now.tv64)
+ if (alarm->node.expires > now)
return overrun;
/*
* This (and the ktime_add() below) is the
@@ -516,12 +516,13 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
spin_lock_irqsave(&ptr->it_lock, flags);
if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
- if (posix_timer_event(ptr, 0) != 0)
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
+ posix_timer_event(ptr, 0) != 0)
ptr->it_overrun++;
}
/* Re-add periodic timers */
- if (ptr->it.alarm.interval.tv64) {
+ if (ptr->it.alarm.interval) {
ptr->it_overrun += alarm_forward(alarm, now,
ptr->it.alarm.interval);
result = ALARMTIMER_RESTART;
@@ -729,7 +730,7 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
rem = ktime_sub(exp, alarm_bases[type].gettime());
- if (rem.tv64 <= 0)
+ if (rem <= 0)
return 0;
rmt = ktime_to_timespec(rem);
@@ -754,7 +755,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
struct alarm alarm;
int ret = 0;
- exp.tv64 = restart->nanosleep.expires;
+ exp = restart->nanosleep.expires;
alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
if (alarmtimer_do_nsleep(&alarm, exp))
@@ -834,7 +835,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
restart = &current->restart_block;
restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
- restart->nanosleep.expires = exp.tv64;
+ restart->nanosleep.expires = exp;
restart->nanosleep.rmtp = rmtp;
ret = -ERESTART_RESTARTBLOCK;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 2c5bc77c0bb0..97ac0951f164 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -179,7 +179,7 @@ void clockevents_switch_state(struct clock_event_device *dev,
void clockevents_shutdown(struct clock_event_device *dev)
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
}
/**
@@ -213,7 +213,7 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
printk_deferred(KERN_WARNING
"CE: Reprogramming failure. Giving up\n");
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
return -ETIME;
}
@@ -310,7 +310,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
int64_t delta;
int rc;
- if (unlikely(expires.tv64 < 0)) {
+ if (unlikely(expires < 0)) {
WARN_ON_ONCE(1);
return -ETIME;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 150242ccfcd2..665985b0a89a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -170,7 +170,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow, cslast, wdlast, delta;
+ u64 csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 08be5c99d26b..c6ecedd3b839 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -50,7 +50,7 @@
#include <linux/timer.h>
#include <linux/freezer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/timer.h>
@@ -171,7 +171,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
return 0;
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+ return expires <= new_base->cpu_base->expires_next;
#else
return 0;
#endif
@@ -313,7 +313,7 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
* We use KTIME_SEC_MAX here, the maximum timeout which we can
* return to user space in a timespec:
*/
- if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
+ if (res < 0 || res < lhs || res < rhs)
res = ktime_set(KTIME_SEC_MAX, 0);
return res;
@@ -465,8 +465,8 @@ static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
{
struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
unsigned int active = cpu_base->active_bases;
+ ktime_t expires, expires_next = KTIME_MAX;
hrtimer_update_next_timer(cpu_base, NULL);
for (; active; base++, active >>= 1) {
@@ -479,7 +479,7 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires.tv64 < expires_next.tv64) {
+ if (expires < expires_next) {
expires_next = expires;
hrtimer_update_next_timer(cpu_base, timer);
}
@@ -489,8 +489,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
* the clock bases so the result might be negative. Fix it up
* to prevent a false positive in clockevents_program_event().
*/
- if (expires_next.tv64 < 0)
- expires_next.tv64 = 0;
+ if (expires_next < 0)
+ expires_next = 0;
return expires_next;
}
#endif
@@ -561,10 +561,10 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
expires_next = __hrtimer_get_next_event(cpu_base);
- if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
+ if (skip_equal && expires_next == cpu_base->expires_next)
return;
- cpu_base->expires_next.tv64 = expires_next.tv64;
+ cpu_base->expires_next = expires_next;
/*
* If a hang was detected in the last timer interrupt then we
@@ -622,10 +622,10 @@ static void hrtimer_reprogram(struct hrtimer *timer,
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Set it to 0.
*/
- if (expires.tv64 < 0)
- expires.tv64 = 0;
+ if (expires < 0)
+ expires = 0;
- if (expires.tv64 >= cpu_base->expires_next.tv64)
+ if (expires >= cpu_base->expires_next)
return;
/* Update the pointer to the next expiring timer */
@@ -653,7 +653,7 @@ static void hrtimer_reprogram(struct hrtimer *timer,
*/
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
- base->expires_next.tv64 = KTIME_MAX;
+ base->expires_next = KTIME_MAX;
base->hres_active = 0;
}
@@ -827,21 +827,21 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
delta = ktime_sub(now, hrtimer_get_expires(timer));
- if (delta.tv64 < 0)
+ if (delta < 0)
return 0;
if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
return 0;
- if (interval.tv64 < hrtimer_resolution)
- interval.tv64 = hrtimer_resolution;
+ if (interval < hrtimer_resolution)
+ interval = hrtimer_resolution;
- if (unlikely(delta.tv64 >= interval.tv64)) {
+ if (unlikely(delta >= interval)) {
s64 incr = ktime_to_ns(interval);
orun = ktime_divns(delta, incr);
hrtimer_add_expires_ns(timer, incr * orun);
- if (hrtimer_get_expires_tv64(timer) > now.tv64)
+ if (hrtimer_get_expires_tv64(timer) > now)
return orun;
/*
* This (and the ktime_add() below) is the
@@ -955,7 +955,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
*/
timer->is_rel = mode & HRTIMER_MODE_REL;
if (timer->is_rel)
- tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
+ tim = ktime_add_safe(tim, hrtimer_resolution);
#endif
return tim;
}
@@ -1104,7 +1104,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (!__hrtimer_hres_active(cpu_base))
- expires = __hrtimer_get_next_event(cpu_base).tv64;
+ expires = __hrtimer_get_next_event(cpu_base);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1296,7 +1296,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
- if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
+ if (basenow < hrtimer_get_softexpires_tv64(timer))
break;
__run_hrtimer(cpu_base, base, timer, &basenow);
@@ -1318,7 +1318,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
raw_spin_lock(&cpu_base->lock);
entry_time = now = hrtimer_update_base(cpu_base);
@@ -1331,7 +1331,7 @@ retry:
* timers which run their callback and need to be requeued on
* this CPU.
*/
- cpu_base->expires_next.tv64 = KTIME_MAX;
+ cpu_base->expires_next = KTIME_MAX;
__hrtimer_run_queues(cpu_base, now);
@@ -1379,13 +1379,13 @@ retry:
cpu_base->hang_detected = 1;
raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
- if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
- cpu_base->max_hang_time = (unsigned int) delta.tv64;
+ if ((unsigned int)delta > cpu_base->max_hang_time)
+ cpu_base->max_hang_time = (unsigned int) delta;
/*
* Limit it to a sensible value as we enforce a longer
* delay. Give the CPU at least 100ms to catch up.
*/
- if (delta.tv64 > 100 * NSEC_PER_MSEC)
+ if (delta > 100 * NSEC_PER_MSEC)
expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
else
expires_next = ktime_add(now, delta);
@@ -1495,7 +1495,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
ktime_t rem;
rem = hrtimer_expires_remaining(timer);
- if (rem.tv64 <= 0)
+ if (rem <= 0)
return 0;
rmt = ktime_to_timespec(rem);
@@ -1693,7 +1693,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* Optimize when a zero timeout value is given. It does not
* matter whether this is an absolute or a relative time.
*/
- if (expires && !expires->tv64) {
+ if (expires && *expires == 0) {
__set_current_state(TASK_RUNNING);
return 0;
}
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 2b9f45bc955d..8c89143f9ebf 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -14,7 +14,7 @@
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/**
* itimer_get_remtime - get remaining time for the timer
@@ -34,10 +34,10 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
* then we return 0 - which is correct.
*/
if (hrtimer_active(timer)) {
- if (rem.tv64 <= 0)
- rem.tv64 = NSEC_PER_USEC;
+ if (rem <= 0)
+ rem = NSEC_PER_USEC;
} else
- rem.tv64 = 0;
+ rem = 0;
return ktime_to_timeval(rem);
}
@@ -216,12 +216,12 @@ again:
goto again;
}
expires = timeval_to_ktime(value->it_value);
- if (expires.tv64 != 0) {
+ if (expires != 0) {
tsk->signal->it_real_incr =
timeval_to_ktime(value->it_interval);
hrtimer_start(timer, expires, HRTIMER_MODE_REL);
} else
- tsk->signal->it_real_incr.tv64 = 0;
+ tsk->signal->it_real_incr = 0;
trace_itimer_state(ITIMER_REAL, value, 0);
spin_unlock_irq(&tsk->sighand->siglock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 555e21f7b966..a4a0e478e44d 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -59,9 +59,9 @@
#define JIFFIES_SHIFT 8
#endif
-static cycle_t jiffies_read(struct clocksource *cs)
+static u64 jiffies_read(struct clocksource *cs)
{
- return (cycle_t) jiffies;
+ return (u64) jiffies;
}
static struct clocksource clocksource_jiffies = {
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 6df8927c58a5..edf19cc53140 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -381,7 +381,7 @@ ktime_t ntp_get_next_leap(void)
if ((time_state == TIME_INS) && (time_status & STA_INS))
return ktime_set(ntp_next_leap_sec, 0);
- ret.tv64 = KTIME_MAX;
+ ret = KTIME_MAX;
return ret;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index f246763c9947..e9e8c10f0d9a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -6,7 +6,7 @@
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
#include <linux/tick.h>
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index f2826c35e918..1e6623d76750 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -36,7 +36,7 @@
#include <linux/time.h>
#include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
@@ -359,7 +359,7 @@ static void schedule_next_timer(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- if (timr->it.real.interval.tv64 == 0)
+ if (timr->it.real.interval == 0)
return;
timr->it_overrun += (unsigned int) hrtimer_forward(timer,
@@ -449,7 +449,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
- if (timr->it.real.interval.tv64 != 0)
+ if (timr->it.real.interval != 0)
si_private = ++timr->it_requeue_pending;
if (posix_timer_event(timr, si_private)) {
@@ -458,7 +458,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
* we will not get a call back to restart it AND
* it should be restarted.
*/
- if (timr->it.real.interval.tv64 != 0) {
+ if (timr->it.real.interval != 0) {
ktime_t now = hrtimer_cb_get_time(timer);
/*
@@ -485,9 +485,9 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
*/
#ifdef CONFIG_HIGH_RES_TIMERS
{
- ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ);
+ ktime_t kj = NSEC_PER_SEC / HZ;
- if (timr->it.real.interval.tv64 < kj.tv64)
+ if (timr->it.real.interval < kj)
now = ktime_add(now, kj);
}
#endif
@@ -743,7 +743,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
iv = timr->it.real.interval;
/* interval timer ? */
- if (iv.tv64)
+ if (iv)
cur_setting->it_interval = ktime_to_timespec(iv);
else if (!hrtimer_active(timer) &&
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
@@ -756,13 +756,13 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
* timer move the expiry time forward by intervals, so
* expiry is > now.
*/
- if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
- (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING ||
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
remaining = __hrtimer_expires_remaining_adjusted(timer, now);
/* Return 0 only, when the timer is expired and not pending */
- if (remaining.tv64 <= 0) {
+ if (remaining <= 0) {
/*
* A single shot SIGEV_NONE timer must return 0, when
* it is expired !
@@ -839,7 +839,7 @@ common_timer_set(struct k_itimer *timr, int flags,
common_timer_get(timr, old_setting);
/* disable the timer */
- timr->it.real.interval.tv64 = 0;
+ timr->it.real.interval = 0;
/*
* careful here. If smp we could be in the "fire" routine which will
* be spinning as we hold the lock. But this is ONLY an SMP issue.
@@ -924,7 +924,7 @@ retry:
static int common_timer_del(struct k_itimer *timer)
{
- timer->it.real.interval.tv64 = 0;
+ timer->it.real.interval = 0;
if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
return TIMER_RETRY;
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 690b797f522e..a7bb8f33ae07 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -97,7 +97,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
- if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+ if (ce_broadcast_hrtimer.next_event != KTIME_MAX)
return HRTIMER_RESTART;
return HRTIMER_NORESTART;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f6aae7977824..3109204c87cc 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -604,14 +604,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
- dev->next_event.tv64 = KTIME_MAX;
- next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
+ next_event = KTIME_MAX;
cpumask_clear(tmpmask);
now = ktime_get();
/* Find all expired events */
for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
td = &per_cpu(tick_cpu_device, cpu);
- if (td->evtdev->next_event.tv64 <= now.tv64) {
+ if (td->evtdev->next_event <= now) {
cpumask_set_cpu(cpu, tmpmask);
/*
* Mark the remote cpu in the pending mask, so
@@ -619,8 +619,8 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
* timer in tick_broadcast_oneshot_control().
*/
cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
- } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
- next_event.tv64 = td->evtdev->next_event.tv64;
+ } else if (td->evtdev->next_event < next_event) {
+ next_event = td->evtdev->next_event;
next_cpu = cpu;
}
}
@@ -657,7 +657,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
* - There are pending events on sleeping CPUs which were not
* in the event mask
*/
- if (next_event.tv64 != KTIME_MAX)
+ if (next_event != KTIME_MAX)
tick_broadcast_set_event(dev, next_cpu, next_event);
raw_spin_unlock(&tick_broadcast_lock);
@@ -672,7 +672,7 @@ static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
{
if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
return 0;
- if (bc->next_event.tv64 == KTIME_MAX)
+ if (bc->next_event == KTIME_MAX)
return 0;
return bc->bound_on == cpu ? -EBUSY : 0;
}
@@ -688,7 +688,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
if (broadcast_needs_cpu(bc, smp_processor_id()))
return;
- if (dev->next_event.tv64 < bc->next_event.tv64)
+ if (dev->next_event < bc->next_event)
return;
}
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
@@ -754,7 +754,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
*/
if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
ret = -EBUSY;
- } else if (dev->next_event.tv64 < bc->next_event.tv64) {
+ } else if (dev->next_event < bc->next_event) {
tick_broadcast_set_event(bc, cpu, dev->next_event);
/*
* In case of hrtimer broadcasts the
@@ -789,7 +789,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
/*
* Bail out if there is no next event.
*/
- if (dev->next_event.tv64 == KTIME_MAX)
+ if (dev->next_event == KTIME_MAX)
goto out;
/*
* If the pending bit is not set, then we are
@@ -824,7 +824,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
* nohz fixups.
*/
now = ktime_get();
- if (dev->next_event.tv64 <= now.tv64) {
+ if (dev->next_event <= now) {
cpumask_set_cpu(cpu, tick_broadcast_force_mask);
goto out;
}
@@ -871,6 +871,9 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
int cpu = smp_processor_id();
+ if (!bc)
+ return;
+
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
int was_periodic = clockevent_state_periodic(bc);
@@ -894,7 +897,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_next_period);
tick_broadcast_set_event(bc, cpu, tick_next_period);
} else
- bc->next_event.tv64 = KTIME_MAX;
+ bc->next_event = KTIME_MAX;
} else {
/*
* The first cpu which switches to oneshot mode sets
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 4fcd99e12aa0..49edc1c4f3e6 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -178,8 +178,8 @@ static void tick_setup_device(struct tick_device *td,
struct clock_event_device *newdev, int cpu,
const struct cpumask *cpumask)
{
- ktime_t next_event;
void (*handler)(struct clock_event_device *) = NULL;
+ ktime_t next_event = 0;
/*
* First device setup ?
@@ -195,7 +195,7 @@ static void tick_setup_device(struct tick_device *td,
else
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
tick_next_period = ktime_get();
- tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
+ tick_period = NSEC_PER_SEC / HZ;
}
/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index b51344652330..6b009c207671 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -28,7 +28,7 @@ int tick_program_event(ktime_t expires, int force)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- if (unlikely(expires.tv64 == KTIME_MAX)) {
+ if (unlikely(expires == KTIME_MAX)) {
/*
* We don't need the clock event device any more, stop it.
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 71496a20e670..74e0388cc88d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -58,21 +58,21 @@ static void tick_do_update_jiffies64(ktime_t now)
* Do a quick check without holding jiffies_lock:
*/
delta = ktime_sub(now, last_jiffies_update);
- if (delta.tv64 < tick_period.tv64)
+ if (delta < tick_period)
return;
/* Reevaluate with jiffies_lock held */
write_seqlock(&jiffies_lock);
delta = ktime_sub(now, last_jiffies_update);
- if (delta.tv64 >= tick_period.tv64) {
+ if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
last_jiffies_update = ktime_add(last_jiffies_update,
tick_period);
/* Slow path for long timeouts */
- if (unlikely(delta.tv64 >= tick_period.tv64)) {
+ if (unlikely(delta >= tick_period)) {
s64 incr = ktime_to_ns(tick_period);
ticks = ktime_divns(delta, incr);
@@ -101,7 +101,7 @@ static ktime_t tick_init_jiffy_update(void)
write_seqlock(&jiffies_lock);
/* Did we start the jiffies update yet ? */
- if (last_jiffies_update.tv64 == 0)
+ if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
write_sequnlock(&jiffies_lock);
@@ -669,7 +669,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqbegin(&jiffies_lock);
- basemono = last_jiffies_update.tv64;
+ basemono = last_jiffies_update;
basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
@@ -697,7 +697,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
*/
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
- tick.tv64 = 0;
+ tick = 0;
/*
* Tell the timer code that the base is not idle, i.e. undo
@@ -764,10 +764,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
expires = KTIME_MAX;
expires = min_t(u64, expires, next_tick);
- tick.tv64 = expires;
+ tick = expires;
/* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && (expires == dev->next_event.tv64))
+ if (ts->tick_stopped && (expires == ts->next_tick))
goto out;
/*
@@ -787,6 +787,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
+ ts->next_tick = tick;
+
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
@@ -802,7 +804,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
else
tick_program_event(tick, 1);
out:
- /* Update the estimated sleep length */
+ /*
+ * Update the estimated sleep length until the next timer
+ * (not only the tick).
+ */
ts->sleep_length = ktime_sub(dev->next_event, now);
return tick;
}
@@ -864,7 +869,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
- ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
+ ts->sleep_length = NSEC_PER_SEC / HZ;
return false;
}
@@ -914,7 +919,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
ts->idle_calls++;
expires = tick_nohz_stop_sched_tick(ts, now, cpu);
- if (expires.tv64 > 0LL) {
+ if (expires > 0LL) {
ts->idle_sleeps++;
ts->idle_expires = expires;
}
@@ -1051,7 +1056,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
tick_sched_do_timer(now);
tick_sched_handle(ts, regs);
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index bf38226e5c17..075444e3d48e 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -27,6 +27,7 @@ enum tick_nohz_mode {
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
+ * @next_tick: Next tick to be fired when in dynticks mode.
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
@@ -44,6 +45,7 @@ struct tick_sched {
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
ktime_t last_tick;
+ ktime_t next_tick;
int inidle;
int tick_stopped;
unsigned long idle_jiffies;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index bd62fb8e8e77..a3a9a8a029dc 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -38,7 +38,7 @@
#include <linux/math64.h>
#include <linux/ptrace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <generated/timeconst.h>
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 4687b3104bae..8afd78932bdf 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(timecounter_init);
*/
static u64 timecounter_read_delta(struct timecounter *tc)
{
- cycle_t cycle_now, cycle_delta;
+ u64 cycle_now, cycle_delta;
u64 ns_offset;
/* read cycle counter: */
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(timecounter_read);
* time previous to the time stored in the cycle counter.
*/
static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
- cycle_t cycles, u64 mask, u64 frac)
+ u64 cycles, u64 mask, u64 frac)
{
u64 ns = (u64) cycles;
@@ -90,7 +90,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
}
u64 timecounter_cyc2time(struct timecounter *tc,
- cycle_t cycle_tstamp)
+ u64 cycle_tstamp)
{
u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
u64 nsec = tc->nsec, frac = tc->frac;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index da233cdf89b0..db087d7e106d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -104,7 +104,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
*/
set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
-tk->wall_to_monotonic.tv_nsec);
- WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
+ WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
tk->wall_to_monotonic = wtm;
set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
tk->offs_real = timespec64_to_ktime(tmp);
@@ -119,10 +119,10 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
- cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
+ u64 max_cycles = tk->tkr_mono.clock->max_cycles;
const char *name = tk->tkr_mono.clock->name;
if (offset > max_cycles) {
@@ -158,10 +158,10 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
}
}
-static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
{
struct timekeeper *tk = &tk_core.timekeeper;
- cycle_t now, last, mask, max, delta;
+ u64 now, last, mask, max, delta;
unsigned int seq;
/*
@@ -199,12 +199,12 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
return delta;
}
#else
-static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
-static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
{
- cycle_t cycle_now, delta;
+ u64 cycle_now, delta;
/* read clocksource */
cycle_now = tkr->read(tkr->clock);
@@ -229,7 +229,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
*/
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
- cycle_t interval;
+ u64 interval;
u64 tmp, ntpinterval;
struct clocksource *old_clock;
@@ -254,7 +254,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (tmp == 0)
tmp = 1;
- interval = (cycle_t) tmp;
+ interval = (u64) tmp;
tk->cycle_interval = interval;
/* Go back from cycles -> shifted ns */
@@ -298,8 +298,7 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
-static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
- cycle_t delta)
+static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta)
{
u64 nsec;
@@ -312,16 +311,15 @@ static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
static inline u64 timekeeping_get_ns(struct tk_read_base *tkr)
{
- cycle_t delta;
+ u64 delta;
delta = timekeeping_get_delta(tkr);
return timekeeping_delta_to_ns(tkr, delta);
}
-static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
- cycle_t cycles)
+static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, u64 cycles)
{
- cycle_t delta;
+ u64 delta;
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
@@ -454,9 +452,9 @@ u64 notrace ktime_get_boot_fast_ns(void)
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
/* Suspend-time cycles value for halted fast timekeeper. */
-static cycle_t cycles_at_suspend;
+static u64 cycles_at_suspend;
-static cycle_t dummy_clock_read(struct clocksource *cs)
+static u64 dummy_clock_read(struct clocksource *cs)
{
return cycles_at_suspend;
}
@@ -573,7 +571,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
static inline void tk_update_leap_state(struct timekeeper *tk)
{
tk->next_leap_ktime = ntp_get_next_leap();
- if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+ if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}
@@ -650,7 +648,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
static void timekeeping_forward_now(struct timekeeper *tk)
{
struct clocksource *clock = tk->tkr_mono.clock;
- cycle_t cycle_now, delta;
+ u64 cycle_now, delta;
u64 nsec;
cycle_now = tk->tkr_mono.read(clock);
@@ -923,7 +921,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
ktime_t base_real;
u64 nsec_raw;
u64 nsec_real;
- cycle_t now;
+ u64 now;
WARN_ON_ONCE(timekeeping_suspended);
@@ -982,8 +980,8 @@ static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
* interval is partial_history_cycles.
*/
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
- cycle_t partial_history_cycles,
- cycle_t total_history_cycles,
+ u64 partial_history_cycles,
+ u64 total_history_cycles,
bool discontinuity,
struct system_device_crosststamp *ts)
{
@@ -1047,7 +1045,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
/*
* cycle_between - true if test occurs chronologically between before and after
*/
-static bool cycle_between(cycle_t before, cycle_t test, cycle_t after)
+static bool cycle_between(u64 before, u64 test, u64 after)
{
if (test > before && test < after)
return true;
@@ -1077,7 +1075,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
{
struct system_counterval_t system_counterval;
struct timekeeper *tk = &tk_core.timekeeper;
- cycle_t cycles, now, interval_start;
+ u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
ktime_t base_real, base_raw;
u64 nsec_real, nsec_raw;
@@ -1138,7 +1136,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* current interval
*/
if (do_interp) {
- cycle_t partial_history_cycles, total_history_cycles;
+ u64 partial_history_cycles, total_history_cycles;
bool discontinuity;
/*
@@ -1644,7 +1642,7 @@ void timekeeping_resume(void)
struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
- cycle_t cycle_now;
+ u64 cycle_now;
sleeptime_injected = false;
read_persistent_clock64(&ts_new);
@@ -2010,11 +2008,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
*
* Returns the unconsumed cycles.
*/
-static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
- u32 shift,
- unsigned int *clock_set)
+static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
+ u32 shift, unsigned int *clock_set)
{
- cycle_t interval = tk->cycle_interval << shift;
+ u64 interval = tk->cycle_interval << shift;
u64 raw_nsecs;
/* If the offset is smaller than a shifted interval, do nothing */
@@ -2055,7 +2052,7 @@ void update_wall_time(void)
{
struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
- cycle_t offset;
+ u64 offset;
int shift = 0, maxshift;
unsigned int clock_set = 0;
unsigned long flags;
@@ -2253,7 +2250,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
}
/* Handle leapsecond insertion adjustments */
- if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+ if (unlikely(base >= tk->next_leap_ktime))
*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
} while (read_seqcount_retry(&tk_core.seq, seq));
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 5be76270ec4a..9a18f121f399 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -13,9 +13,9 @@ extern void tk_debug_account_sleep_time(struct timespec64 *t);
#endif
#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
-static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
{
- cycle_t ret = (now - last) & mask;
+ u64 ret = (now - last) & mask;
/*
* Prevent time going backwards by checking the MSB of mask in
@@ -24,7 +24,7 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
return ret & ~(mask >> 1) ? 0 : ret;
}
#else
-static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
{
return (now - last) & mask;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ea4fbf8477a9..ec33a6933eae 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -43,7 +43,7 @@
#include <linux/slab.h>
#include <linux/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ba7d8b288bb3..afe6cd1944fc 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -17,7 +17,7 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "tick-internal.h"
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 087204c733eb..afddded947df 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -43,7 +43,7 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* This is our basic unit of interest: a timer expiry event identified
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2a96b063d659..d5038005eb5d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -70,6 +70,7 @@ config FTRACE_NMI_ENTER
config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
+ select GLOB
bool
config CONTEXT_SWITCH_TRACER
@@ -133,6 +134,7 @@ config FUNCTION_TRACER
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
+ select GLOB
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 33dd57f53f88..eb230f06ba41 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2847,7 +2847,7 @@ static void ftrace_shutdown_sysctl(void)
}
}
-static cycle_t ftrace_update_time;
+static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
static inline int ops_traces_mod(struct ftrace_ops *ops)
@@ -2894,7 +2894,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
- cycle_t start, stop;
+ u64 start, stop;
unsigned long update_cnt = 0;
unsigned long rec_flags = 0;
int i;
@@ -3511,6 +3511,10 @@ static int ftrace_match(char *str, struct ftrace_glob *g)
memcmp(str + slen - g->len, g->search, g->len) == 0)
matched = 1;
break;
+ case MATCH_GLOB:
+ if (glob_match(g->search, str))
+ matched = 1;
+ break;
}
return matched;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 89a2611a1635..a85739efcc30 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -245,7 +245,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
EXPORT_SYMBOL_GPL(ring_buffer_event_length);
/* inline for ring buffer fast paths */
-static void *
+static __always_inline void *
rb_event_data(struct ring_buffer_event *event)
{
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
@@ -1798,48 +1798,48 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
}
EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
-static inline void *
+static __always_inline void *
__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
{
return bpage->data + index;
}
-static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
+static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
{
return bpage->page->data + index;
}
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
{
return __rb_page_index(cpu_buffer->reader_page,
cpu_buffer->reader_page->read);
}
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
return __rb_page_index(iter->head_page, iter->head);
}
-static inline unsigned rb_page_commit(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}
/* Size is determined by what has been committed */
-static inline unsigned rb_page_size(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
{
return rb_page_commit(bpage);
}
-static inline unsigned
+static __always_inline unsigned
rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
{
return rb_page_commit(cpu_buffer->commit_page);
}
-static inline unsigned
+static __always_inline unsigned
rb_event_index(struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
@@ -2355,7 +2355,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
local_inc(&cpu_buffer->commits);
}
-static void
+static __always_inline void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long max_count;
@@ -2410,7 +2410,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
goto again;
}
-static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -2455,7 +2455,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static inline bool
+static __always_inline bool
rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -2469,7 +2469,7 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_commit_index(cpu_buffer) == index;
}
-static void
+static __always_inline void
rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -2702,7 +2702,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
return event;
}
-static struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
rb_reserve_next_event(struct ring_buffer *buffer,
struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 54d5270a5042..d7449783987a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -40,6 +40,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
+#include <linux/trace.h>
#include <linux/sched/rt.h>
#include "trace.h"
@@ -68,6 +69,7 @@ bool __read_mostly tracing_selftest_disabled;
/* Pipe tracepoints to printk */
struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
+static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
@@ -234,7 +236,7 @@ static int __init set_tracepoint_printk(char *str)
}
__setup("tp_printk", set_tracepoint_printk);
-unsigned long long ns2usecs(cycle_t nsec)
+unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
do_div(nsec, 1000);
@@ -571,7 +573,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
return read;
}
-static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
+static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
@@ -585,7 +587,7 @@ static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
return ts;
}
-cycle_t ftrace_now(int cpu)
+u64 ftrace_now(int cpu)
{
return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
}
@@ -738,6 +740,31 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
#endif
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+ int type, unsigned long flags, int pc)
+{
+ struct trace_entry *ent = ring_buffer_event_data(event);
+
+ tracing_generic_entry_update(ent, flags, pc);
+ ent->type = type;
+}
+
+static __always_inline struct ring_buffer_event *
+__trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags, int pc)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(buffer, len);
+ if (event != NULL)
+ trace_event_setup(event, type, flags, pc);
+
+ return event;
+}
+
static void tracer_tracing_on(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
@@ -767,6 +794,22 @@ void tracing_on(void)
}
EXPORT_SYMBOL_GPL(tracing_on);
+
+static __always_inline void
+__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+{
+ __this_cpu_write(trace_cmdline_save, true);
+
+ /* If this is the temp buffer, we need to commit fully */
+ if (this_cpu_read(trace_buffered_event) == event) {
+ /* Length is in event->array[0] */
+ ring_buffer_write(buffer, event->array[0], &event->array[1]);
+ /* Release the temp buffer */
+ this_cpu_dec(trace_buffered_event_cnt);
+ } else
+ ring_buffer_unlock_commit(buffer, event);
+}
+
/**
* __trace_puts - write a constant string into the trace buffer.
* @ip: The address of the caller
@@ -794,8 +837,8 @@ int __trace_puts(unsigned long ip, const char *str, int size)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ irq_flags, pc);
if (!event)
return 0;
@@ -842,8 +885,8 @@ int __trace_bputs(unsigned long ip, const char *str)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+ irq_flags, pc);
if (!event)
return 0;
@@ -1907,35 +1950,19 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
- ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+ ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
-static __always_inline void
-trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned long flags, int pc)
-{
- struct trace_entry *ent = ring_buffer_event_data(event);
-
- tracing_generic_entry_update(ent, flags, pc);
- ent->type = type;
-}
-
struct ring_buffer_event *
trace_buffer_lock_reserve(struct ring_buffer *buffer,
int type,
unsigned long len,
unsigned long flags, int pc)
{
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(buffer, len);
- if (event != NULL)
- trace_event_setup(event, type, flags, pc);
-
- return event;
+ return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
}
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -2049,21 +2076,6 @@ void trace_buffered_event_disable(void)
preempt_enable();
}
-void
-__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
-{
- __this_cpu_write(trace_cmdline_save, true);
-
- /* If this is the temp buffer, we need to commit fully */
- if (this_cpu_read(trace_buffered_event) == event) {
- /* Length is in event->array[0] */
- ring_buffer_write(buffer, event->array[0], &event->array[1]);
- /* Release the temp buffer */
- this_cpu_dec(trace_buffered_event_cnt);
- } else
- ring_buffer_unlock_commit(buffer, event);
-}
-
static struct ring_buffer *temp_buffer;
struct ring_buffer_event *
@@ -2090,8 +2102,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
this_cpu_dec(trace_buffered_event_cnt);
}
- entry = trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
/*
* If tracing is off, but we have triggers enabled
* we still need to look at the event data. Use the temp_buffer
@@ -2100,13 +2112,88 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
*/
if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
- entry = trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
}
return entry;
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
+static DEFINE_SPINLOCK(tracepoint_iter_lock);
+static DEFINE_MUTEX(tracepoint_printk_mutex);
+
+static void output_printk(struct trace_event_buffer *fbuffer)
+{
+ struct trace_event_call *event_call;
+ struct trace_event *event;
+ unsigned long flags;
+ struct trace_iterator *iter = tracepoint_print_iter;
+
+ /* We should never get here if iter is NULL */
+ if (WARN_ON_ONCE(!iter))
+ return;
+
+ event_call = fbuffer->trace_file->event_call;
+ if (!event_call || !event_call->event.funcs ||
+ !event_call->event.funcs->trace)
+ return;
+
+ event = &fbuffer->trace_file->event_call->event;
+
+ spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ trace_seq_init(&iter->seq);
+ iter->ent = fbuffer->entry;
+ event_call->event.funcs->trace(iter, 0, event);
+ trace_seq_putc(&iter->seq, 0);
+ printk("%s", iter->seq.buffer);
+
+ spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+}
+
+int tracepoint_printk_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int save_tracepoint_printk;
+ int ret;
+
+ mutex_lock(&tracepoint_printk_mutex);
+ save_tracepoint_printk = tracepoint_printk;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ /*
+ * This will force exiting early, as tracepoint_printk
+ * is always zero when tracepoint_printk_iter is not allocated
+ */
+ if (!tracepoint_print_iter)
+ tracepoint_printk = 0;
+
+ if (save_tracepoint_printk == tracepoint_printk)
+ goto out;
+
+ if (tracepoint_printk)
+ static_key_enable(&tracepoint_printk_key.key);
+ else
+ static_key_disable(&tracepoint_printk_key.key);
+
+ out:
+ mutex_unlock(&tracepoint_printk_mutex);
+
+ return ret;
+}
+
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
+{
+ if (static_key_false(&tracepoint_printk_key.key))
+ output_printk(fbuffer);
+
+ event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
+ fbuffer->event, fbuffer->entry,
+ fbuffer->flags, fbuffer->pc);
+}
+EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
+
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct ring_buffer *buffer,
struct ring_buffer_event *event,
@@ -2129,6 +2216,139 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
ftrace_trace_userstack(buffer, flags, pc);
}
+/*
+ * Similar to trace_buffer_unlock_commit_regs() but do not dump stack.
+ */
+void
+trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ __buffer_unlock_commit(buffer, event);
+}
+
+static void
+trace_process_export(struct trace_export *export,
+ struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+ unsigned int size = 0;
+
+ entry = ring_buffer_event_data(event);
+ size = ring_buffer_event_length(event);
+ export->write(entry, size);
+}
+
+static DEFINE_MUTEX(ftrace_export_lock);
+
+static struct trace_export __rcu *ftrace_exports_list __read_mostly;
+
+static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled);
+
+static inline void ftrace_exports_enable(void)
+{
+ static_branch_enable(&ftrace_exports_enabled);
+}
+
+static inline void ftrace_exports_disable(void)
+{
+ static_branch_disable(&ftrace_exports_enabled);
+}
+
+void ftrace_exports(struct ring_buffer_event *event)
+{
+ struct trace_export *export;
+
+ preempt_disable_notrace();
+
+ export = rcu_dereference_raw_notrace(ftrace_exports_list);
+ while (export) {
+ trace_process_export(export, event);
+ export = rcu_dereference_raw_notrace(export->next);
+ }
+
+ preempt_enable_notrace();
+}
+
+static inline void
+add_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ rcu_assign_pointer(export->next, *list);
+ /*
+ * We are entering export into the list but another
+ * CPU might be walking that list. We need to make sure
+ * the export->next pointer is valid before another CPU sees
+ * the export pointer included into the list.
+ */
+ rcu_assign_pointer(*list, export);
+}
+
+static inline int
+rm_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ struct trace_export **p;
+
+ for (p = list; *p != NULL; p = &(*p)->next)
+ if (*p == export)
+ break;
+
+ if (*p != export)
+ return -1;
+
+ rcu_assign_pointer(*p, (*p)->next);
+
+ return 0;
+}
+
+static inline void
+add_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ if (*list == NULL)
+ ftrace_exports_enable();
+
+ add_trace_export(list, export);
+}
+
+static inline int
+rm_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ int ret;
+
+ ret = rm_trace_export(list, export);
+ if (*list == NULL)
+ ftrace_exports_disable();
+
+ return ret;
+}
+
+int register_ftrace_export(struct trace_export *export)
+{
+ if (WARN_ON_ONCE(!export->write))
+ return -1;
+
+ mutex_lock(&ftrace_export_lock);
+
+ add_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_export);
+
+int unregister_ftrace_export(struct trace_export *export)
+{
+ int ret;
+
+ mutex_lock(&ftrace_export_lock);
+
+ ret = rm_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_export);
+
void
trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -2139,16 +2359,19 @@ trace_function(struct trace_array *tr,
struct ring_buffer_event *event;
struct ftrace_entry *entry;
- event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!call_filter_check_discard(call, entry, buffer, event)) {
+ if (static_branch_unlikely(&ftrace_exports_enabled))
+ ftrace_exports(event);
__buffer_unlock_commit(buffer, event);
+ }
}
#ifdef CONFIG_STACKTRACE
@@ -2216,8 +2439,8 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
size *= sizeof(unsigned long);
- event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
+ sizeof(*entry) + size, flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -2318,8 +2541,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
__this_cpu_inc(user_stack_count);
- event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
+ sizeof(*entry), flags, pc);
if (!event)
goto out_drop_count;
entry = ring_buffer_event_data(event);
@@ -2489,8 +2712,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -2545,8 +2768,8 @@ __trace_array_vprintk(struct ring_buffer *buffer,
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -4055,6 +4278,7 @@ static const char readme_msg[] =
" x86-tsc: TSC cycle counter\n"
#endif
"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
+ "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
" tracing_cpumask\t- Limit which CPUs to trace\n"
" instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
"\t\t\t Remove sub-buffer with rmdir\n"
@@ -4066,7 +4290,7 @@ static const char readme_msg[] =
"\n available_filter_functions - list of functions that can be filtered on\n"
" set_ftrace_filter\t- echo function name in here to only trace these\n"
"\t\t\t functions\n"
- "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ "\t accepts: func_full_name or glob-matching-pattern\n"
"\t modules: Can select a group via module\n"
"\t Format: :mod:<module-name>\n"
"\t example: echo :mod:ext3 > set_ftrace_filter\n"
@@ -5519,21 +5743,18 @@ static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
- unsigned long addr = (unsigned long)ubuf;
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
- struct page *pages[2];
- void *map_page[2];
- int nr_pages = 1;
+ const char faulted[] = "<faulted>";
ssize_t written;
- int offset;
int size;
int len;
- int ret;
- int i;
+
+/* Used in tracing_mark_raw_write() as well */
+#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
if (tracing_disabled)
return -EINVAL;
@@ -5544,60 +5765,33 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
- /*
- * Userspace is injecting traces into the kernel trace buffer.
- * We want to be as non intrusive as possible.
- * To do so, we do not want to allocate any special buffers
- * or take any locks, but instead write the userspace data
- * straight into the ring buffer.
- *
- * First we need to pin the userspace buffer into memory,
- * which, most likely it is, because it just referenced it.
- * But there's no guarantee that it is. By using get_user_pages_fast()
- * and kmap_atomic/kunmap_atomic() we can get access to the
- * pages directly. We then write the data directly into the
- * ring buffer.
- */
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- /* check if we cross pages */
- if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
- nr_pages = 2;
-
- offset = addr & (PAGE_SIZE - 1);
- addr &= PAGE_MASK;
-
- ret = get_user_pages_fast(addr, nr_pages, 0, pages);
- if (ret < nr_pages) {
- while (--ret >= 0)
- put_page(pages[ret]);
- written = -EFAULT;
- goto out;
- }
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
- for (i = 0; i < nr_pages; i++)
- map_page[i] = kmap_atomic(pages[i]);
+ /* If less than "<faulted>", then make sure we can still add that */
+ if (cnt < FAULTED_SIZE)
+ size += FAULTED_SIZE - cnt;
- local_save_flags(irq_flags);
- size = sizeof(*entry) + cnt + 2; /* possible \n added */
buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
- if (!event) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, preempt_count());
+ if (unlikely(!event))
/* Ring buffer disabled, return as if not open for write */
- written = -EBADF;
- goto out_unlock;
- }
+ return -EBADF;
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
- if (nr_pages == 2) {
- len = PAGE_SIZE - offset;
- memcpy(&entry->buf, map_page[0] + offset, len);
- memcpy(&entry->buf[len], map_page[1], cnt - len);
+ len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+ if (len) {
+ memcpy(&entry->buf, faulted, FAULTED_SIZE);
+ cnt = FAULTED_SIZE;
+ written = -EFAULT;
} else
- memcpy(&entry->buf, map_page[0] + offset, cnt);
+ written = cnt;
+ len = cnt;
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
@@ -5607,16 +5801,73 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
__buffer_unlock_commit(buffer, event);
- written = cnt;
+ if (written > 0)
+ *fpos += written;
- *fpos += written;
+ return written;
+}
+
+/* Limit it for now to 3K (including tag) */
+#define RAW_DATA_MAX_SIZE (1024*3)
+
+static ssize_t
+tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct raw_data_entry *entry;
+ const char faulted[] = "<faulted>";
+ unsigned long irq_flags;
+ ssize_t written;
+ int size;
+ int len;
+
+#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+
+ if (tracing_disabled)
+ return -EINVAL;
+
+ if (!(tr->trace_flags & TRACE_ITER_MARKERS))
+ return -EINVAL;
+
+ /* The marker must at least have a tag id */
+ if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+ return -EINVAL;
+
+ if (cnt > TRACE_BUF_SIZE)
+ cnt = TRACE_BUF_SIZE;
+
+ BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt;
+ if (cnt < FAULT_SIZE_ID)
+ size += FAULT_SIZE_ID - cnt;
+
+ buffer = tr->trace_buffer.buffer;
+ event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
+ irq_flags, preempt_count());
+ if (!event)
+ /* Ring buffer disabled, return as if not open for write */
+ return -EBADF;
+
+ entry = ring_buffer_event_data(event);
+
+ len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
+ if (len) {
+ entry->id = -1;
+ memcpy(&entry->buf, faulted, FAULTED_SIZE);
+ written = -EFAULT;
+ } else
+ written = cnt;
+
+ __buffer_unlock_commit(buffer, event);
+
+ if (written > 0)
+ *fpos += written;
- out_unlock:
- for (i = nr_pages - 1; i >= 0; i--) {
- kunmap_atomic(map_page[i]);
- put_page(pages[i]);
- }
- out:
return written;
}
@@ -5946,6 +6197,13 @@ static const struct file_operations tracing_mark_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_mark_raw_fops = {
+ .open = tracing_open_generic_tr,
+ .write = tracing_mark_raw_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
static const struct file_operations trace_clock_fops = {
.open = tracing_clock_open,
.read = seq_read,
@@ -7215,6 +7473,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ trace_create_file("trace_marker_raw", 0220, d_tracer,
+ tr, &tracing_mark_raw_fops);
+
trace_create_file("trace_clock", 0644, d_tracer, tr,
&trace_clock_fops);
@@ -7752,6 +8013,8 @@ void __init trace_init(void)
kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
if (WARN_ON(!tracepoint_print_iter))
tracepoint_printk = 0;
+ else
+ static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
trace_event_init();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd24b1f9ac43..1ea51ab53edf 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -15,6 +15,7 @@
#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/trace_seq.h>
+#include <linux/glob.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -39,6 +40,7 @@ enum trace_type {
TRACE_BLK,
TRACE_BPUTS,
TRACE_HWLAT,
+ TRACE_RAW_DATA,
__TRACE_LAST_TYPE,
};
@@ -157,7 +159,7 @@ struct trace_array_cpu {
unsigned long policy;
unsigned long rt_priority;
unsigned long skipped_entries;
- cycle_t preempt_timestamp;
+ u64 preempt_timestamp;
pid_t pid;
kuid_t uid;
char comm[TASK_COMM_LEN];
@@ -175,7 +177,7 @@ struct trace_buffer {
struct trace_array *tr;
struct ring_buffer *buffer;
struct trace_array_cpu __percpu *data;
- cycle_t time_start;
+ u64 time_start;
int cpu;
};
@@ -330,6 +332,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \
+ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -599,8 +602,8 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
-void __buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event);
+void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
int trace_empty(struct trace_iterator *iter);
@@ -686,7 +689,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
}
#endif /* CONFIG_STACKTRACE */
-extern cycle_t ftrace_now(int cpu);
+extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
@@ -733,7 +736,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
-extern unsigned long long ns2usecs(cycle_t nsec);
+extern unsigned long long ns2usecs(u64 nsec);
extern int
trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
@@ -843,6 +846,17 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return 0;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
+
+extern unsigned int fgraph_max_depth;
+
+static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+{
+ /* trace it when it is-nested-in or is a function enabled. */
+ return !(trace->depth || ftrace_graph_addr(trace->func)) ||
+ (trace->depth < 0) ||
+ (fgraph_max_depth && trace->depth >= fgraph_max_depth);
+}
+
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
@@ -1257,6 +1271,7 @@ enum regex_type {
MATCH_FRONT_ONLY,
MATCH_MIDDLE_ONLY,
MATCH_END_ONLY,
+ MATCH_GLOB,
};
struct regex {
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 0f109c4130d3..e3b488825ae3 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -21,6 +21,8 @@ static u64 bm_stddev;
static unsigned int bm_avg;
static unsigned int bm_std;
+static bool ok_to_run;
+
/*
* This gets called in a loop recording the time it took to write
* the tracepoint. What it writes is the time statistics of the last
@@ -164,11 +166,21 @@ static int benchmark_event_kthread(void *arg)
* When the benchmark tracepoint is enabled, it calls this
* function and the thread that calls the tracepoint is created.
*/
-void trace_benchmark_reg(void)
+int trace_benchmark_reg(void)
{
+ if (!ok_to_run) {
+ pr_warning("trace benchmark cannot be started via kernel command line\n");
+ return -EBUSY;
+ }
+
bm_event_thread = kthread_run(benchmark_event_kthread,
NULL, "event_benchmark");
- WARN_ON(!bm_event_thread);
+ if (!bm_event_thread) {
+ pr_warning("trace benchmark failed to create kernel thread\n");
+ return -ENOMEM;
+ }
+
+ return 0;
}
/*
@@ -182,6 +194,7 @@ void trace_benchmark_unreg(void)
return;
kthread_stop(bm_event_thread);
+ bm_event_thread = NULL;
strcpy(bm_str, "START");
bm_total = 0;
@@ -196,3 +209,12 @@ void trace_benchmark_unreg(void)
bm_avg = 0;
bm_stddev = 0;
}
+
+static __init int ok_to_run_trace_benchmark(void)
+{
+ ok_to_run = true;
+
+ return 0;
+}
+
+early_initcall(ok_to_run_trace_benchmark);
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index 3c1df1df4e29..ebdbfc2f2a64 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -6,7 +6,7 @@
#include <linux/tracepoint.h>
-extern void trace_benchmark_reg(void);
+extern int trace_benchmark_reg(void);
extern void trace_benchmark_unreg(void);
#define BENCHMARK_EVENT_STRLEN 128
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 3a2a73716a5b..75489de546b6 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -81,7 +81,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->correct = val == expect;
if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
out:
current->trace_recursion &= ~TRACE_BRANCH_BIT;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index d1cc37e78f99..eb7396b7e7c3 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -244,6 +244,21 @@ FTRACE_ENTRY(print, print_entry,
FILTER_OTHER
);
+FTRACE_ENTRY(raw_data, raw_data_entry,
+
+ TRACE_RAW_DATA,
+
+ F_STRUCT(
+ __field( unsigned int, id )
+ __dynamic_array( char, buf )
+ ),
+
+ F_printk("id:%04x %08x",
+ __entry->id, (int)__entry->buf[0]),
+
+ FILTER_OTHER
+);
+
FTRACE_ENTRY(bputs, bputs_entry,
TRACE_BPUTS,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 03c0a48c3ac4..93116549a284 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -283,46 +283,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
}
EXPORT_SYMBOL_GPL(trace_event_buffer_reserve);
-static DEFINE_SPINLOCK(tracepoint_iter_lock);
-
-static void output_printk(struct trace_event_buffer *fbuffer)
-{
- struct trace_event_call *event_call;
- struct trace_event *event;
- unsigned long flags;
- struct trace_iterator *iter = tracepoint_print_iter;
-
- if (!iter)
- return;
-
- event_call = fbuffer->trace_file->event_call;
- if (!event_call || !event_call->event.funcs ||
- !event_call->event.funcs->trace)
- return;
-
- event = &fbuffer->trace_file->event_call->event;
-
- spin_lock_irqsave(&tracepoint_iter_lock, flags);
- trace_seq_init(&iter->seq);
- iter->ent = fbuffer->entry;
- event_call->event.funcs->trace(iter, 0, event);
- trace_seq_putc(&iter->seq, 0);
- printk("%s", iter->seq.buffer);
-
- spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
-}
-
-void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
-{
- if (tracepoint_printk)
- output_printk(fbuffer);
-
- event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
- fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc);
-}
-EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
-
int trace_event_reg(struct trace_event_call *call,
enum trace_reg type, void *data)
{
@@ -742,6 +702,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
struct trace_event_call *call;
const char *name;
int ret = -EINVAL;
+ int eret = 0;
list_for_each_entry(file, &tr->events, list) {
@@ -765,9 +726,17 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
if (event && strcmp(event, name) != 0)
continue;
- ftrace_event_enable_disable(file, set);
+ ret = ftrace_event_enable_disable(file, set);
- ret = 0;
+ /*
+ * Save the first error and return that. Some events
+ * may still have been enabled, but let the user
+ * know that something went wrong.
+ */
+ if (ret && !eret)
+ eret = ret;
+
+ ret = eret;
}
return ret;
@@ -2843,20 +2812,32 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
+ entry = trace_create_file("enable", 0644, d_events,
+ tr, &ftrace_tr_enable_fops);
+ if (!entry) {
+ pr_warn("Could not create tracefs 'enable' entry\n");
+ return -ENOMEM;
+ }
+
+ /* There are not as crucial, just warn if they are not created */
+
entry = tracefs_create_file("set_event_pid", 0644, parent,
tr, &ftrace_set_event_pid_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'set_event_pid' entry\n");
/* ring buffer internal formats */
- trace_create_file("header_page", 0444, d_events,
- ring_buffer_print_page_header,
- &ftrace_show_header_fops);
-
- trace_create_file("header_event", 0444, d_events,
- ring_buffer_print_entry_header,
- &ftrace_show_header_fops);
+ entry = trace_create_file("header_page", 0444, d_events,
+ ring_buffer_print_page_header,
+ &ftrace_show_header_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'header_page' entry\n");
- trace_create_file("enable", 0644, d_events,
- tr, &ftrace_tr_enable_fops);
+ entry = trace_create_file("header_event", 0444, d_events,
+ ring_buffer_print_entry_header,
+ &ftrace_show_header_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'header_event' entry\n");
tr->event_dir = d_events;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9daa9b3bc6d9..59a411ff60c7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -108,12 +108,12 @@ static char *err_text[] = {
};
struct opstack_op {
- int op;
+ enum filter_op_ids op;
struct list_head list;
};
struct postfix_elt {
- int op;
+ enum filter_op_ids op;
char *operand;
struct list_head list;
};
@@ -145,34 +145,50 @@ struct pred_stack {
/* If not of not match is equal to not of not, then it is a match */
#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_##type(struct filter_pred *pred, void *event) \
+static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = 0; \
- \
- switch (pred->op) { \
- case OP_LT: \
- match = (*addr < val); \
- break; \
- case OP_LE: \
- match = (*addr <= val); \
- break; \
- case OP_GT: \
- match = (*addr > val); \
- break; \
- case OP_GE: \
- match = (*addr >= val); \
- break; \
- case OP_BAND: \
- match = (*addr & val); \
- break; \
- default: \
- break; \
- } \
- \
+ int match = (*addr < val); \
return !!match == !pred->not; \
-}
+} \
+static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = (*addr <= val); \
+ return !!match == !pred->not; \
+} \
+static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = (*addr > val); \
+ return !!match == !pred->not; \
+} \
+static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = (*addr >= val); \
+ return !!match == !pred->not; \
+} \
+static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = !!(*addr & val); \
+ return match == !pred->not; \
+} \
+static const filter_pred_fn_t pred_funcs_##type[] = { \
+ filter_pred_LT_##type, \
+ filter_pred_LE_##type, \
+ filter_pred_GT_##type, \
+ filter_pred_GE_##type, \
+ filter_pred_BAND_##type, \
+};
+
+#define PRED_FUNC_START OP_LT
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
@@ -344,6 +360,12 @@ static int regex_match_end(char *str, struct regex *r, int len)
return 0;
}
+static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused)
+{
+ if (glob_match(r->pattern, str))
+ return 1;
+ return 0;
+}
/**
* filter_parse_regex - parse a basic regex
* @buff: the raw regex
@@ -380,14 +402,20 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
if (!i) {
*search = buff + 1;
type = MATCH_END_ONLY;
- } else {
+ } else if (i == len - 1) {
if (type == MATCH_END_ONLY)
type = MATCH_MIDDLE_ONLY;
else
type = MATCH_FRONT_ONLY;
buff[i] = 0;
break;
+ } else { /* pattern continues, use full glob */
+ type = MATCH_GLOB;
+ break;
}
+ } else if (strchr("[?\\", buff[i])) {
+ type = MATCH_GLOB;
+ break;
}
}
@@ -420,6 +448,9 @@ static void filter_build_regex(struct filter_pred *pred)
case MATCH_END_ONLY:
r->match = regex_match_end;
break;
+ case MATCH_GLOB:
+ r->match = regex_match_glob;
+ break;
}
pred->not ^= not;
@@ -946,7 +977,7 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static bool is_legal_op(struct ftrace_event_field *field, int op)
+static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op)
{
if (is_string_field(field) &&
(op != OP_EQ && op != OP_NE && op != OP_GLOB))
@@ -957,8 +988,8 @@ static bool is_legal_op(struct ftrace_event_field *field, int op)
return true;
}
-static filter_pred_fn_t select_comparison_fn(int op, int field_size,
- int field_is_signed)
+static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
+ int field_size, int field_is_signed)
{
filter_pred_fn_t fn = NULL;
@@ -967,33 +998,33 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_64;
else if (field_is_signed)
- fn = filter_pred_s64;
+ fn = pred_funcs_s64[op - PRED_FUNC_START];
else
- fn = filter_pred_u64;
+ fn = pred_funcs_u64[op - PRED_FUNC_START];
break;
case 4:
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_32;
else if (field_is_signed)
- fn = filter_pred_s32;
+ fn = pred_funcs_s32[op - PRED_FUNC_START];
else
- fn = filter_pred_u32;
+ fn = pred_funcs_u32[op - PRED_FUNC_START];
break;
case 2:
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_16;
else if (field_is_signed)
- fn = filter_pred_s16;
+ fn = pred_funcs_s16[op - PRED_FUNC_START];
else
- fn = filter_pred_u16;
+ fn = pred_funcs_u16[op - PRED_FUNC_START];
break;
case 1:
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_8;
else if (field_is_signed)
- fn = filter_pred_s8;
+ fn = pred_funcs_s8[op - PRED_FUNC_START];
else
- fn = filter_pred_u8;
+ fn = pred_funcs_u8[op - PRED_FUNC_START];
break;
}
@@ -1166,7 +1197,8 @@ static inline int append_operand_char(struct filter_parse_state *ps, char c)
return 0;
}
-static int filter_opstack_push(struct filter_parse_state *ps, int op)
+static int filter_opstack_push(struct filter_parse_state *ps,
+ enum filter_op_ids op)
{
struct opstack_op *opstack_op;
@@ -1200,7 +1232,7 @@ static int filter_opstack_top(struct filter_parse_state *ps)
static int filter_opstack_pop(struct filter_parse_state *ps)
{
struct opstack_op *opstack_op;
- int op;
+ enum filter_op_ids op;
if (filter_opstack_empty(ps))
return OP_NONE;
@@ -1245,7 +1277,7 @@ static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
return 0;
}
-static int postfix_append_op(struct filter_parse_state *ps, int op)
+static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op)
{
struct postfix_elt *elt;
@@ -1275,8 +1307,8 @@ static void postfix_clear(struct filter_parse_state *ps)
static int filter_parse(struct filter_parse_state *ps)
{
+ enum filter_op_ids op, top_op;
int in_string = 0;
- int op, top_op;
char ch;
while ((ch = infix_next(ps))) {
@@ -1367,7 +1399,8 @@ parse_operand:
static struct filter_pred *create_pred(struct filter_parse_state *ps,
struct trace_event_call *call,
- int op, char *operand1, char *operand2)
+ enum filter_op_ids op,
+ char *operand1, char *operand2)
{
struct ftrace_event_field *field;
static struct filter_pred pred;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4e480e870474..d56123cdcc89 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,7 +65,7 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
-static unsigned int max_depth;
+unsigned int fgraph_max_depth;
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
@@ -358,7 +358,7 @@ int __trace_graph_entry(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
@@ -384,10 +384,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (!ftrace_trace_task(tr))
return 0;
- /* trace it when it is-nested-in or is a function enabled. */
- if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
- ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
- (max_depth && trace->depth >= max_depth))
+ if (ftrace_graph_ignore_func(trace))
+ return 0;
+
+ if (ftrace_graph_ignore_irqs())
return 0;
/*
@@ -469,7 +469,7 @@ void __trace_graph_return(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->ret = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -842,6 +842,10 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
@@ -850,7 +854,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data->depth = call->depth - 1;
/* No need to keep this function around for this depth */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = 0;
}
@@ -880,11 +885,16 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
cpu_data->depth = call->depth;
/* Save this function pointer to see if the exit matches */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = call->func;
}
@@ -1114,7 +1124,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
*/
cpu_data->depth = trace->depth - 1;
- if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+ if (trace->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(trace->depth < 0)) {
if (cpu_data->enter_funcs[trace->depth] != trace->func)
func_match = 0;
cpu_data->enter_funcs[trace->depth] = 0;
@@ -1489,7 +1500,7 @@ graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- max_depth = val;
+ fgraph_max_depth = val;
*ppos += cnt;
@@ -1503,7 +1514,7 @@ graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
int n;
- n = sprintf(buf, "%d\n", max_depth);
+ n = sprintf(buf, "%d\n", fgraph_max_depth);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
}
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b97286c48735..775569ec50d0 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -127,7 +127,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
entry->nmi_count = sample->nmi_count;
if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/* Macros to encapsulate the time capturing infrastructure */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 03cdff84d026..7758bc0617cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -175,6 +175,18 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
int ret;
int pc;
+ if (ftrace_graph_ignore_func(trace))
+ return 0;
+ /*
+ * Do not trace a function if it's filtered by set_graph_notrace.
+ * Make the index of ret stack negative to indicate that it should
+ * ignore further functions. But it needs its own ret stack entry
+ * to recover the original index in order to continue tracing after
+ * returning from the function.
+ */
+ if (ftrace_graph_notrace_addr(trace->func))
+ return 1;
+
if (!func_prolog_dec(tr, &data, &flags))
return 0;
@@ -286,7 +298,7 @@ static void irqsoff_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static bool report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, u64 delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
@@ -304,7 +316,7 @@ check_critical_timing(struct trace_array *tr,
unsigned long parent_ip,
int cpu)
{
- cycle_t T0, T1, delta;
+ u64 T0, T1, delta;
unsigned long flags;
int pc;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index eb6c9f1d3a93..a133ecd741e4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -73,6 +73,17 @@ static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
return !!strchr(trace_kprobe_symbol(tk), ':');
}
+static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
+{
+ unsigned long nhit = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ nhit += *per_cpu_ptr(tk->nhit, cpu);
+
+ return nhit;
+}
+
static int register_kprobe_event(struct trace_kprobe *tk);
static int unregister_kprobe_event(struct trace_kprobe *tk);
@@ -882,14 +893,10 @@ static const struct file_operations kprobe_events_ops = {
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_kprobe *tk = v;
- unsigned long nhit = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- nhit += *per_cpu_ptr(tk->nhit, cpu);
seq_printf(m, " %-44s %15lu %15lu\n",
- trace_event_name(&tk->tp.call), nhit,
+ trace_event_name(&tk->tp.call),
+ trace_kprobe_nhit(tk),
tk->rp.kp.nmissed);
return 0;
@@ -1354,18 +1361,18 @@ fs_initcall(init_kprobe_trace);
#ifdef CONFIG_FTRACE_STARTUP_TEST
-
/*
* The "__used" keeps gcc from removing the function symbol
- * from the kallsyms table.
+ * from the kallsyms table. 'noinline' makes sure that there
+ * isn't an inlined version used by the test method below
*/
-static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
- int a4, int a5, int a6)
+static __used __init noinline int
+kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6)
{
return a1 + a2 + a3 + a4 + a5 + a6;
}
-static struct trace_event_file *
+static struct __init trace_event_file *
find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
{
struct trace_event_file *file;
@@ -1443,12 +1450,25 @@ static __init int kprobe_trace_self_tests_init(void)
ret = target(1, 2, 3, 4, 5, 6);
+ /*
+ * Not expecting an error here, the check is only to prevent the
+ * optimizer from removing the call to target() as otherwise there
+ * are no side-effects and the call is never performed.
+ */
+ if (ret != 21)
+ warn++;
+
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
if (WARN_ON_ONCE(tk == NULL)) {
pr_warn("error on getting test probe.\n");
warn++;
} else {
+ if (trace_kprobe_nhit(tk) != 1) {
+ pr_warn("incorrect number of testprobe hits\n");
+ warn++;
+ }
+
file = find_trace_probe_file(tk, top_trace_array());
if (WARN_ON_ONCE(file == NULL)) {
pr_warn("error on getting probe file.\n");
@@ -1462,6 +1482,11 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting 2nd test probe.\n");
warn++;
} else {
+ if (trace_kprobe_nhit(tk) != 1) {
+ pr_warn("incorrect number of testprobe2 hits\n");
+ warn++;
+ }
+
file = find_trace_probe_file(tk, top_trace_array());
if (WARN_ON_ONCE(file == NULL)) {
pr_warn("error on getting probe file.\n");
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3fc20422c166..5d33a7352919 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1288,6 +1288,35 @@ static struct trace_event trace_print_event = {
.funcs = &trace_print_funcs,
};
+static enum print_line_t trace_raw_data(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct raw_data_entry *field;
+ int i;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "# %x buf:", field->id);
+
+ for (i = 0; i < iter->ent_size - offsetof(struct raw_data_entry, buf); i++)
+ trace_seq_printf(&iter->seq, " %02x",
+ (unsigned char)field->buf[i]);
+
+ trace_seq_putc(&iter->seq, '\n');
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions trace_raw_data_funcs = {
+ .trace = trace_raw_data,
+ .raw = trace_raw_data,
+};
+
+static struct trace_event trace_raw_data_event = {
+ .type = TRACE_RAW_DATA,
+ .funcs = &trace_raw_data_funcs,
+};
+
static struct trace_event *events[] __initdata = {
&trace_fn_event,
@@ -1299,6 +1328,7 @@ static struct trace_event *events[] __initdata = {
&trace_bprint_event,
&trace_print_event,
&trace_hwlat_event,
+ &trace_raw_data_event,
NULL
};
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9d4399b553a3..ddec53b67646 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -239,6 +239,18 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
unsigned long flags;
int pc, ret = 0;
+ if (ftrace_graph_ignore_func(trace))
+ return 0;
+ /*
+ * Do not trace a function if it's filtered by set_graph_notrace.
+ * Make the index of ret stack negative to indicate that it should
+ * ignore further functions. But it needs its own ret stack entry
+ * to recover the original index in order to continue tracing after
+ * returning from the function.
+ */
+ if (ftrace_graph_notrace_addr(trace->func))
+ return 1;
+
if (!func_prolog_preempt_disable(tr, &data, &pc))
return 0;
@@ -346,7 +358,7 @@ static void wakeup_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static bool report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, u64 delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
@@ -428,7 +440,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
- cycle_t T0, T1, delta;
+ u64 T0, T1, delta;
unsigned long flags;
long disabled;
int cpu;
@@ -790,6 +802,7 @@ static struct tracer wakeup_dl_tracer __read_mostly =
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
+ .allow_instances = true,
.use_max_tr = true,
};
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d0639d917899..1f9a31f934a4 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -194,9 +194,13 @@ static int tracepoint_add_func(struct tracepoint *tp,
struct tracepoint_func *func, int prio)
{
struct tracepoint_func *old, *tp_funcs;
+ int ret;
- if (tp->regfunc && !static_key_enabled(&tp->key))
- tp->regfunc();
+ if (tp->regfunc && !static_key_enabled(&tp->key)) {
+ ret = tp->regfunc();
+ if (ret < 0)
+ return ret;
+ }
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
@@ -529,7 +533,7 @@ EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
static int sys_tracepoint_refcount;
-void syscall_regfunc(void)
+int syscall_regfunc(void)
{
struct task_struct *p, *t;
@@ -541,6 +545,8 @@ void syscall_regfunc(void)
read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
+
+ return 0;
}
void syscall_unregfunc(void)
diff --git a/kernel/uid16.c b/kernel/uid16.c
index cc40793464e3..71645ae9303a 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -14,7 +14,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f280ec..d4b0fa01cae3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,32 +24,14 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
-#include <linux/perf_event.h>
#include <linux/kthread.h>
-/*
- * The run state of the lockup detectors is controlled by the content of the
- * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
- * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
- *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
- */
-#define NMI_WATCHDOG_ENABLED_BIT 0
-#define SOFT_WATCHDOG_ENABLED_BIT 1
-#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
-#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
-
static DEFINE_MUTEX(watchdog_proc_mutex);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
#else
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
int __read_mostly nmi_watchdog_enabled;
int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
static unsigned long soft_lockup_nmi_warn;
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-
-static int __init hardlockup_panic_setup(char *str)
-{
- if (!strncmp(str, "panic", 5))
- hardlockup_panic = 1;
- else if (!strncmp(str, "nopanic", 7))
- hardlockup_panic = 0;
- else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
- else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void)
wq_watchdog_touch(-1);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void touch_nmi_watchdog(void)
-{
- /*
- * Using __raw here because some code paths have
- * preemption enabled. If preemption is enabled
- * then interrupts should be enabled too, in which
- * case we shouldn't have to worry about the watchdog
- * going off.
- */
- raw_cpu_write(watchdog_nmi_touch, true);
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-#endif
-
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
__this_cpu_write(watchdog_touch_ts, 0);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
-static bool is_hardlockup(void)
+bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
@@ -299,7 +219,6 @@ static bool is_hardlockup(void)
__this_cpu_write(hrtimer_interrupts_saved, hrint);
return false;
}
-#endif
static int is_softlockup(unsigned long touch_ts)
{
@@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-
-static struct perf_event_attr wd_hw_attr = {
- .type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
- .size = sizeof(struct perf_event_attr),
- .pinned = 1,
- .disabled = 1,
-};
-
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- /* Ensure the watchdog never gets throttled */
- event->hw.interrupts = 0;
-
- if (__this_cpu_read(watchdog_nmi_touch) == true) {
- __this_cpu_write(watchdog_nmi_touch, false);
- return;
- }
-
- /* check for a hardlockup
- * This is done by making sure our timer interrupt
- * is incrementing. The timer interrupt should have
- * fired multiple times before we overflow'd. If it hasn't
- * then this is a good indication the cpu is stuck
- */
- if (is_hardlockup()) {
- int this_cpu = smp_processor_id();
- struct pt_regs *regs = get_irq_regs();
-
- /* only print hardlockups once */
- if (__this_cpu_read(hard_watchdog_warn) == true)
- return;
-
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
-
- /*
- * Perform all-CPU dump only once to avoid multiple hardlockups
- * generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &hardlockup_allcpu_dumped))
- trigger_allbutself_cpu_backtrace();
-
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
-
- __this_cpu_write(hard_watchdog_warn, true);
- return;
- }
-
- __this_cpu_write(hard_watchdog_warn, false);
- return;
-}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-static int watchdog_nmi_enable(unsigned int cpu);
-static void watchdog_nmi_disable(unsigned int cpu);
+/*
+ * These two functions are mostly architecture specific
+ * defining them as weak here.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+ return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
static int watchdog_enable_all_cpus(void);
static void watchdog_disable_all_cpus(void);
@@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu)
watchdog_nmi_disable(cpu);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long cpu0_err;
-
-static int watchdog_nmi_enable(unsigned int cpu)
-{
- struct perf_event_attr *wd_attr;
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- /* nothing to do if the hard lockup detector is disabled */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto out;
-
- /* is it already setup and enabled? */
- if (event && event->state > PERF_EVENT_STATE_OFF)
- goto out;
-
- /* it is setup but not enabled */
- if (event != NULL)
- goto out_enable;
-
- wd_attr = &wd_hw_attr;
- wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-
- /* Try to register using hardware perf events */
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-
- /* save cpu0 error for future comparision */
- if (cpu == 0 && IS_ERR(event))
- cpu0_err = PTR_ERR(event);
-
- if (!IS_ERR(event)) {
- /* only print for cpu0 or different than cpu0 */
- if (cpu == 0 || cpu0_err)
- pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
- goto out_save;
- }
-
- /*
- * Disable the hard lockup detector if _any_ CPU fails to set up
- * set up the hardware perf event. The watchdog() function checks
- * the NMI_WATCHDOG_ENABLED bit periodically.
- *
- * The barriers are for syncing up watchdog_enabled across all the
- * cpus, as clear_bit() does not use barriers.
- */
- smp_mb__before_atomic();
- clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
- smp_mb__after_atomic();
-
- /* skip displaying the same error again */
- if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
- return PTR_ERR(event);
-
- /* vary the KERN level based on the returned errno */
- if (PTR_ERR(event) == -EOPNOTSUPP)
- pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
- else if (PTR_ERR(event) == -ENOENT)
- pr_warn("disabled (cpu%i): hardware events not enabled\n",
- cpu);
- else
- pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
-
- pr_info("Shutting down hard lockup detector on all cpus\n");
-
- return PTR_ERR(event);
-
- /* success path */
-out_save:
- per_cpu(watchdog_ev, cpu) = event;
-out_enable:
- perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
- return 0;
-}
-
-static void watchdog_nmi_disable(unsigned int cpu)
-{
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- if (event) {
- perf_event_disable(event);
- per_cpu(watchdog_ev, cpu) = NULL;
-
- /* should be in cleanup, but blocks oprofile */
- perf_event_release_kernel(event);
- }
- if (cpu == 0) {
- /* watchdog_nmi_enable() expects this to be zero initially. */
- cpu0_err = 0;
- }
-}
-
-#else
-static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
-static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
.thread_should_run = watchdog_should_run,
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
new file mode 100644
index 000000000000..84016c8aee6b
--- /dev/null
+++ b/kernel/watchdog_hld.c
@@ -0,0 +1,227 @@
+/*
+ * Detect hard lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
+ * to those contributors as well.
+ */
+
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+void touch_nmi_watchdog(void)
+{
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_nmi_touch, true);
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+static struct perf_event_attr wd_hw_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 1,
+};
+
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /* Ensure the watchdog never gets throttled */
+ event->hw.interrupts = 0;
+
+ if (__this_cpu_read(watchdog_nmi_touch) == true) {
+ __this_cpu_write(watchdog_nmi_touch, false);
+ return;
+ }
+
+ /* check for a hardlockup
+ * This is done by making sure our timer interrupt
+ * is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup()) {
+ int this_cpu = smp_processor_id();
+
+ /* only print hardlockups once */
+ if (__this_cpu_read(hard_watchdog_warn) == true)
+ return;
+
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ /*
+ * Perform all-CPU dump only once to avoid multiple hardlockups
+ * generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+
+ __this_cpu_write(hard_watchdog_warn, true);
+ return;
+ }
+
+ __this_cpu_write(hard_watchdog_warn, false);
+ return;
+}
+
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ struct perf_event_attr *wd_attr;
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
+
+ /* is it already setup and enabled? */
+ if (event && event->state > PERF_EVENT_STATE_OFF)
+ goto out;
+
+ /* it is setup but not enabled */
+ if (event != NULL)
+ goto out_enable;
+
+ wd_attr = &wd_hw_attr;
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+ /* Try to register using hardware perf events */
+ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+ /* save cpu0 error for future comparision */
+ if (cpu == 0 && IS_ERR(event))
+ cpu0_err = PTR_ERR(event);
+
+ if (!IS_ERR(event)) {
+ /* only print for cpu0 or different than cpu0 */
+ if (cpu == 0 || cpu0_err)
+ pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
+ goto out_save;
+ }
+
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to set up
+ * set up the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
+ /* skip displaying the same error again */
+ if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ return PTR_ERR(event);
+
+ /* vary the KERN level based on the returned errno */
+ if (PTR_ERR(event) == -EOPNOTSUPP)
+ pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ else if (PTR_ERR(event) == -ENOENT)
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
+ cpu);
+ else
+ pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
+ return PTR_ERR(event);
+
+ /* success path */
+out_save:
+ per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+ perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event) {
+ perf_event_disable(event);
+ per_cpu(watchdog_ev, cpu) = NULL;
+
+ /* should be in cleanup, but blocks oprofile */
+ perf_event_release_kernel(event);
+ }
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
+}