summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/dma/debug.c 2
-rw-r--r-- kernel/events/core.c 89
-rw-r--r-- kernel/events/ring_buffer.c 37
-rw-r--r-- kernel/irq/chip.c 4
-rw-r--r-- kernel/irq/irqdesc.c 1
-rw-r--r-- kernel/kprobes.c 6
-rw-r--r-- kernel/locking/lockdep.c 20
-rw-r--r-- kernel/sched/deadline.c 3
-rw-r--r-- kernel/sched/fair.c 31
-rw-r--r-- kernel/signal.c 2
-rw-r--r-- kernel/time/alarmtimer.c 2
-rw-r--r-- kernel/time/sched_clock.c 4
-rw-r--r-- kernel/time/tick-common.c 2
-rw-r--r-- kernel/time/timekeeping.h 7
-rw-r--r-- kernel/trace/ftrace.c 6
-rw-r--r-- kernel/trace/trace.c 6
-rw-r--r-- kernel/watchdog_hld.c 3
17 files changed, 151 insertions, 74 deletions
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 45d51e8e26f6..a218e43cc382 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -706,7 +706,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
#ifdef CONFIG_STACKTRACE
entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
entry->stacktrace.entries = entry->st_entries;
- entry->stacktrace.skip = 2;
+ entry->stacktrace.skip = 1;
save_stack_trace(&entry->stacktrace);
#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 72d06e302e99..dc7dead2d2cc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2009,8 +2009,8 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (event->pending_disable) {
- event->pending_disable = 0;
+ if (READ_ONCE(event->pending_disable) >= 0) {
+ WRITE_ONCE(event->pending_disable, -1);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2198,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- event->pending_disable = 1;
+ WRITE_ONCE(event->pending_disable, smp_processor_id());
+ /* can fail, see perf_pending_event_disable() */
irq_work_queue(&event->pending);
}
@@ -5810,10 +5811,45 @@ void perf_event_wakeup(struct perf_event *event)
}
}
+static void perf_pending_event_disable(struct perf_event *event)
+{
+ int cpu = READ_ONCE(event->pending_disable);
+
+ if (cpu < 0)
+ return;
+
+ if (cpu == smp_processor_id()) {
+ WRITE_ONCE(event->pending_disable, -1);
+ perf_event_disable_local(event);
+ return;
+ }
+
+ /*
+ * CPU-A CPU-B
+ *
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-A;
+ * irq_work_queue();
+ *
+ * sched-out
+ * @pending_disable = -1;
+ *
+ * sched-in
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-B;
+ * irq_work_queue(); // FAILS
+ *
+ * irq_work_run()
+ * perf_pending_event()
+ *
+ * But the event runs on CPU-B and wants disabling there.
+ */
+ irq_work_queue_on(&event->pending, cpu);
+}
+
static void perf_pending_event(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry,
- struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending);
int rctx;
rctx = perf_swevent_get_recursion_context();
@@ -5822,10 +5858,7 @@ static void perf_pending_event(struct irq_work *entry)
* and we won't recurse 'further'.
*/
- if (event->pending_disable) {
- event->pending_disable = 0;
- perf_event_disable_local(event);
- }
+ perf_pending_event_disable(event);
if (event->pending_wakeup) {
event->pending_wakeup = 0;
@@ -9044,26 +9077,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (task == TASK_TOMBSTONE)
return;
- if (!ifh->nr_file_filters)
- return;
-
- mm = get_task_mm(event->ctx->task);
- if (!mm)
- goto restart;
+ if (ifh->nr_file_filters) {
+ mm = get_task_mm(event->ctx->task);
+ if (!mm)
+ goto restart;
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
+ }
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- event->addr_filter_ranges[count].start = 0;
- event->addr_filter_ranges[count].size = 0;
+ if (filter->path.dentry) {
+ /*
+ * Adjust base offset if the filter is associated to a
+ * binary that needs to be mapped:
+ */
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
- /*
- * Adjust base offset if the filter is associated to a binary
- * that needs to be mapped:
- */
- if (filter->path.dentry)
perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
+ } else {
+ event->addr_filter_ranges[count].start = filter->offset;
+ event->addr_filter_ranges[count].size = filter->size;
+ }
count++;
}
@@ -9071,9 +9107,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
event->addr_filters_gen++;
raw_spin_unlock_irqrestore(&ifh->lock, flags);
- up_read(&mm->mmap_sem);
+ if (ifh->nr_file_filters) {
+ up_read(&mm->mmap_sem);
- mmput(mm);
+ mmput(mm);
+ }
restart:
perf_event_stop(event, 1);
@@ -10236,6 +10274,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
+ event->pending_disable = -1;
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a4047321d7d8..5eedb49a65ea 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -392,7 +392,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = 1;
+ event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
local_set(&rb->aux_nest, 0);
goto err_put;
@@ -455,24 +455,21 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
}
- if (size || handle->aux_flags) {
- /*
- * Only send RECORD_AUX if we have something useful to communicate
- *
- * Note: the OVERWRITE records by themselves are not considered
- * useful, as they don't communicate any *new* information,
- * aside from the short-lived offset, that becomes history at
- * the next event sched-in and therefore isn't useful.
- * The userspace that needs to copy out AUX data in overwrite
- * mode should know to use user_page::aux_head for the actual
- * offset. So, from now on we don't output AUX records that
- * have *only* OVERWRITE flag set.
- */
-
- if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)
- perf_event_aux_event(handle->event, aux_head, size,
- handle->aux_flags);
- }
+ /*
+ * Only send RECORD_AUX if we have something useful to communicate
+ *
+ * Note: the OVERWRITE records by themselves are not considered
+ * useful, as they don't communicate any *new* information,
+ * aside from the short-lived offset, that becomes history at
+ * the next event sched-in and therefore isn't useful.
+ * The userspace that needs to copy out AUX data in overwrite
+ * mode should know to use user_page::aux_head for the actual
+ * offset. So, from now on we don't output AUX records that
+ * have *only* OVERWRITE flag set.
+ */
+ if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
+ perf_event_aux_event(handle->event, aux_head, size,
+ handle->aux_flags);
rb->user_page->aux_head = rb->aux_head;
if (rb_need_aux_wakeup(rb))
@@ -480,7 +477,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = 1;
+ handle->event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3faef4a77f71..51128bea3846 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1449,6 +1449,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
{
data = data->parent_data;
+
+ if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
if (data->chip->irq_set_wake)
return data->chip->irq_set_wake(data, on);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 13539e12cd80..9f8a709337cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -558,6 +558,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ mutex_init(&desc[i].request_mutex);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c83e54727131..b1ea30a5540e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -709,7 +709,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
static int reuse_unused_kprobe(struct kprobe *ap)
{
struct optimized_kprobe *op;
- int ret;
/*
* Unused kprobe MUST be on the way of delayed unoptimizing (means
@@ -720,9 +719,8 @@ static int reuse_unused_kprobe(struct kprobe *ap)
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
/* Optimize it again (remove from op->list) */
- ret = kprobe_optready(ap);
- if (ret)
- return ret;
+ if (!kprobe_optready(ap))
+ return -EINVAL;
optimize_kprobe(ap);
return 0;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 34cdcbedda49..e221be724fe8 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4689,8 +4689,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
/* closed head */
pf = delayed_free.pf + (delayed_free.index ^ 1);
@@ -4702,8 +4702,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
*/
call_rcu_zapped(delayed_free.pf + delayed_free.index);
- graph_unlock();
-out_irq:
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
raw_local_irq_restore(flags);
}
@@ -4744,21 +4744,17 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
{
struct pending_free *pf;
unsigned long flags;
- int locked;
init_data_structures_once();
raw_local_irq_save(flags);
- locked = graph_lock();
- if (!locked)
- goto out_irq;
-
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
pf = get_pending_free();
__lockdep_free_key_range(pf, start, size);
call_rcu_zapped(pf);
-
- graph_unlock();
-out_irq:
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
raw_local_irq_restore(flags);
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
- WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
* If the "0-lag time" already passed, decrease the active
* utilization now, instead of starting a timer
*/
- if (zerolag_time < 0) {
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdab7eb6f351..a4d9e14bf138 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;
raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
@@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ new = (old * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota = div64_u64(cfs_b->quota, old);
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
@@ -7784,10 +7809,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -7797,7 +7822,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
diff --git a/kernel/signal.c b/kernel/signal.c
index f98448cf2def..227ba170298e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3581,7 +3581,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (flags)
return -EINVAL;
- f = fdget_raw(pidfd);
+ f = fdget(pidfd);
if (!f.file)
return -EBADF;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2c97e8c2d29f..0519a8805aab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return ktime_sub(now, alarm->node.expires);
+ return ktime_sub(alarm->node.expires, now);
}
/**
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 094b82ca95e5..930113b9799a 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -272,7 +272,7 @@ static u64 notrace suspended_sched_clock_read(void)
return cd.read_data[seq & 1].epoch_cyc;
}
-static int sched_clock_suspend(void)
+int sched_clock_suspend(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -283,7 +283,7 @@ static int sched_clock_suspend(void)
return 0;
}
-static void sched_clock_resume(void)
+void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 529143b4c8d2..df401463a191 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -487,6 +487,7 @@ void tick_freeze(void)
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
system_state = SYSTEM_SUSPEND;
+ sched_clock_suspend();
timekeeping_suspend();
} else {
tick_suspend_local();
@@ -510,6 +511,7 @@ void tick_unfreeze(void)
if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
+ sched_clock_resume();
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 7a9b4eb7a1d5..141ab3ab0354 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void);
extern void timekeeping_warp_clock(void);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
+#ifdef CONFIG_GENERIC_SCHED_CLOCK
+extern int sched_clock_suspend(void);
+extern void sched_clock_resume(void);
+#else
+static inline int sched_clock_suspend(void) { return 0; }
+static inline void sched_clock_resume(void) { }
+#endif
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 26c8ca9bd06b..b920358dd8f7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,6 +33,7 @@
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
#include <trace/events/sched.h>
@@ -6246,7 +6247,7 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static inline void
+static nokprobe_inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
@@ -6306,11 +6307,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
{
__ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
+NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
+NOKPROBE_SYMBOL(ftrace_ops_no_ops);
#endif
/*
@@ -6337,6 +6340,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
trace_clear_recursion(bit);
}
+NOKPROBE_SYMBOL(ftrace_ops_assist_func);
/**
* ftrace_ops_get_func - get the function a trampoline should call
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21153e64bf1c..6c24755655c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0;
}
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+ if (ref->ref > INT_MAX/2)
+ return false;
+
ref->ref++;
+ return true;
}
/* Pipe buffer operations for a buffer. */
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71381168dede..247bf0b1582c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
+ this_cpu);
print_modules();
print_irqtrace_events(current);
if (regs)