author     Jakub Kicinski <kuba@kernel.org>   2022-01-27 12:54:16 -0800
committer  Jakub Kicinski <kuba@kernel.org>   2022-01-27 12:54:16 -0800
commit     72d044e4bfa6bd9096536e2e1c62aecfe1a525e4 (patch)
tree       0e408f181f5c704ff43cd006f4643c339e04aff1 /kernel
parent     e2cf07654efb0fd7bbcb475c6f74be7b5755a8fd (diff)
parent     23a46422c56144939c091c76cf389aa863ce9c18 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
No conflicts.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/stackmap.c         |   5
-rw-r--r--  kernel/cgroup/cgroup.c        |  11
-rw-r--r--  kernel/events/core.c          | 246
-rw-r--r--  kernel/hung_task.c            |  81
-rw-r--r--  kernel/irq/proc.c             |   8
-rw-r--r--  kernel/kprobes.c              |  30
-rw-r--r--  kernel/locking/spinlock.c     |  10
-rw-r--r--  kernel/locking/spinlock_rt.c  |  12
-rw-r--r--  kernel/printk/Makefile        |   5
-rw-r--r--  kernel/printk/internal.h      |   8
-rw-r--r--  kernel/printk/printk.c        |   4
-rw-r--r--  kernel/printk/sysctl.c        |  85
-rw-r--r--  kernel/resource.c             |   4
-rw-r--r--  kernel/sched/core.c           |  15
-rw-r--r--  kernel/sched/core_sched.c     |   2
-rw-r--r--  kernel/sched/fair.c           | 118
-rw-r--r--  kernel/sched/pelt.h           |   4
-rw-r--r--  kernel/sched/psi.c            |  66
-rw-r--r--  kernel/stackleak.c            |  26
-rw-r--r--  kernel/sysctl.c               | 724
-rw-r--r--  kernel/time/clocksource.c     |   4
-rw-r--r--  kernel/trace/Kconfig          |   9
-rw-r--r--  kernel/trace/ftrace.c         |   4
-rw-r--r--  kernel/watchdog.c             | 101
24 files changed, 689 insertions, 893 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 49e567209c6b..22c8ae94e4c1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -472,13 +472,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
struct pt_regs *regs;
- long res;
+ long res = -EINVAL;
if (!try_get_task_stack(task))
return -EFAULT;
regs = task_pt_regs(task);
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (regs)
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
put_task_stack(task);
return res;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b31e1465868a..9d05c3ca2d5e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
@@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return PTR_ERR(new);
}
- psi_trigger_replace(&ctx->psi.trigger, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
- psi_trigger_replace(&ctx->psi.trigger, NULL);
+ psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
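The cgroup.c hunks above change the lifetime rule for PSI triggers: each open pressure file now carries at most one trigger, and a second write to the same file descriptor fails with -EBUSY instead of replacing the first trigger. A minimal user-space sketch of arming and polling such a trigger (illustrative only, not part of this patch; the cgroup path is a hypothetical example and the trigger format follows Documentation/accounting/psi.rst):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Hypothetical cgroup; any memory.pressure file behaves the same way. */
        int fd = open("/sys/fs/cgroup/mygroup/memory.pressure", O_RDWR | O_NONBLOCK);
        if (fd < 0)
                return 1;

        /* Arm one trigger: 150ms of "some" memory stall within a 1s window. */
        const char trig[] = "some 150000 1000000";
        if (write(fd, trig, strlen(trig) + 1) < 0)
                return 1;

        /* A second write on this fd would now fail with EBUSY. */

        struct pollfd pfd = { .fd = fd, .events = POLLPRI };
        if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI))
                printf("memory pressure threshold crossed\n");

        close(fd);
        return 0;
}

Closing the file descriptor tears the trigger down through cgroup_pressure_release() above.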
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc18664f49b0..479c9e672ec4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
+/*
+ * UP store-release, load-acquire
+ */
+
+#define __store_release(ptr, val) \
+do { \
+ barrier(); \
+ WRITE_ONCE(*(ptr), (val)); \
+} while (0)
+
+#define __load_acquire(ptr) \
+({ \
+ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
+ barrier(); \
+ ___p; \
+})
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
- struct perf_cgroup_info *info;
- u64 now;
-
- now = perf_clock();
+ struct perf_cgroup_info *t;
- info = this_cpu_ptr(cgrp->info);
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ if (!__load_acquire(&t->active))
+ return t->time;
+ now += READ_ONCE(t->timeoffset);
+ return now;
+}
- info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ if (adv)
+ info->time += now - info->timestamp;
info->timestamp = now;
+ /*
+ * see update_context_time()
+ */
+ WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
+ struct perf_cgroup_info *info;
if (cgrp) {
+ u64 now = perf_clock();
+
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
- __update_cgrp_time(cgrp);
+ info = this_cpu_ptr(cgrp->info);
+
+ __update_cgrp_time(info, now, true);
+ if (final)
+ __store_release(&info->active, 0);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
+ struct perf_cgroup_info *info;
struct perf_cgroup *cgrp;
/*
@@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- __update_cgrp_time(event->cgrp);
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
+ info = this_cpu_ptr(event->cgrp->info);
+ __update_cgrp_time(info, perf_clock(), true);
+ }
}
static inline void
@@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+ __update_cgrp_time(info, ctx->timestamp, false);
+ __store_release(&info->active, 1);
}
}
@@ -982,14 +1019,6 @@ out:
}
static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
-{
- struct perf_cgroup_info *t;
- t = per_cpu_ptr(event->cgrp->info, event->cpu);
- event->shadow_ctx_time = now - t->timestamp;
-}
-
-static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
@@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+ bool final)
{
}
@@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
+ return 0;
}
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
return 0;
}
@@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
- ctx->time += now - ctx->timestamp;
+ if (adv)
+ ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+ __update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx ? ctx->time : 0;
+ return ctx->time;
+}
+
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time_now(event, now);
+
+ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+ return ctx->time;
+
+ now += READ_ONCE(ctx->timeoffset);
+ return now;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -2350,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event,
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
}
event_sched_out(event, cpuctx, ctx);
@@ -2361,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event,
list_del_event(event, ctx);
if (!ctx->nr_events && ctx->is_active) {
+ if (ctx == &cpuctx->ctx)
+ update_cgrp_time_from_cpuctx(cpuctx, true);
+
ctx->is_active = 0;
ctx->rotate_necessary = 0;
if (ctx->task) {
@@ -2482,40 +2552,6 @@ void perf_event_disable_inatomic(struct perf_event *event)
irq_work_queue(&event->pending);
}
-static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- /*
- * use the correct time source for the time snapshot
- *
- * We could get by without this by leveraging the
- * fact that to get to this function, the caller
- * has most likely already called update_context_time()
- * and update_cgrp_time_xx() and thus both timestamp
- * are identical (or very close). Given that tstamp is,
- * already adjusted for cgroup, we could say that:
- * tstamp - ctx->timestamp
- * is equivalent to
- * tstamp - cgrp->timestamp.
- *
- * Then, in perf_output_read(), the calculation would
- * work with no changes because:
- * - event is guaranteed scheduled in
- * - no scheduled out in between
- * - thus the timestamp would be the same
- *
- * But this is a bit hairy.
- *
- * So instead, we have an explicit cgroup call to remain
- * within the time source all along. We believe it
- * is cleaner and simpler to understand.
- */
- if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, event->tstamp);
- else
- event->shadow_ctx_time = event->tstamp - ctx->timestamp;
-}
-
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -2556,8 +2592,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx);
-
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -3251,16 +3285,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
- if (ctx->task) {
- WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
- cpuctx->task_ctx = NULL;
- }
-
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3274,7 +3298,22 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
@@ -3711,13 +3750,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+/*
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
+ */
static inline bool event_update_userpage(struct perf_event *event)
{
if (likely(!atomic_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
perf_event_update_userpage(event);
return true;
@@ -3801,13 +3846,23 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
int is_active = ctx->is_active;
- u64 now;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
+ if (is_active ^ EVENT_TIME) {
+ /* start ctx time */
+ __update_context_time(ctx, false);
+ perf_cgroup_set_timestamp(task, ctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
@@ -3818,13 +3873,6 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (is_active & EVENT_TIME) {
- /* start ctx time */
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
- }
-
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -4418,6 +4466,18 @@ static inline u64 perf_event_count(struct perf_event *event)
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = perf_event_time_now(event, *now);
+ __perf_update_times(event, ctx_time, enabled, running);
+}
+
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4477,10 +4537,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
*value = local64_read(&event->count);
if (enabled || running) {
- u64 now = event->shadow_ctx_time + perf_clock();
- u64 __enabled, __running;
+ u64 __enabled, __running, __now;;
- __perf_update_times(event, now, &__enabled, &__running);
+ calc_timer_values(event, &__now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -5802,18 +5861,6 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void calc_timer_values(struct perf_event *event,
- u64 *now,
- u64 *enabled,
- u64 *running)
-{
- u64 ctx_time;
-
- *now = perf_clock();
- ctx_time = event->shadow_ctx_time + *now;
- __perf_update_times(event, ctx_time, enabled, running);
-}
-
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -6353,7 +6400,6 @@ accounting:
ring_buffer_attach(event, rb);
perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
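The heart of the events/core.c change is the rearrangement spelled out in __update_context_time(): instead of exposing both ctx->time and ctx->timestamp, the writer publishes the single derived value timeoffset = time - timestamp, so a reader in NMI context can compute the current event time as now + timeoffset with one READ_ONCE() and no lock. A minimal sketch of the same publish/consume pattern, with C11 atomics standing in for the kernel's WRITE_ONCE()/READ_ONCE() plus barrier():

#include <stdatomic.h>
#include <stdint.h>

struct ctx_time {
        uint64_t time;               /* accumulated active time, writer only */
        uint64_t timestamp;          /* time of last update, writer only */
        _Atomic uint64_t timeoffset; /* time - timestamp, readable without the lock */
};

/* Writer side, with the context lock held (shape of __update_context_time()). */
static void ctx_time_update(struct ctx_time *c, uint64_t now, int advance)
{
        if (advance)
                c->time += now - c->timestamp;
        c->timestamp = now;
        /* Publish the single derived value; readers never see time/timestamp separately. */
        atomic_store_explicit(&c->timeoffset, c->time - c->timestamp,
                              memory_order_release);
}

/* Lock-free reader (shape of perf_event_time_now()): time(now) = now + timeoffset. */
static uint64_t ctx_time_now(const struct ctx_time *c, uint64_t now)
{
        return now + atomic_load_explicit((_Atomic uint64_t *)&c->timeoffset,
                                          memory_order_acquire);
}

calc_timer_values() and perf_event_time_now() above follow this shape, with the EVENT_TIME and cgroup handling layered on top.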
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9888e2bc8c76..52501e5f7655 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -63,7 +63,9 @@ static struct task_struct *watchdog_task;
* Should we dump all CPUs backtraces in a hung task event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+#else
+#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
/*
@@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked,
MAX_SCHEDULE_TIMEOUT;
}
+#ifdef CONFIG_SYSCTL
/*
* Process updating of timeout sysctl
*/
-int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
+static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
+static struct ctl_table hung_task_sysctls[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "hung_task_all_cpu_backtrace",
+ .data = &sysctl_hung_task_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+ {
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_check_interval_secs",
+ .data = &sysctl_hung_task_check_interval_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_NEG_ONE,
+ },
+ {}
+};
+
+static void __init hung_task_sysctl_init(void)
+{
+ register_sysctl_init("kernel", hung_task_sysctls);
+}
+#else
+#define hung_task_sysctl_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
@@ -310,6 +384,7 @@ static int __init hung_task_init(void)
pm_notifier(hungtask_pm_notify, 0);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+ hung_task_sysctl_init();
return 0;
}
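hung_task.c now owns its sysctl table and registers it from its own init path, with a do-nothing stub when CONFIG_SYSCTL is disabled; the same consolidation pattern repeats below for kprobes, printk and stackleak as their entries leave kernel/sysctl.c. A minimal sketch of that pattern for a hypothetical subsystem (example_knob and example_sysctl_init are placeholder names, not from this patch):

#include <linux/sysctl.h>
#include <linux/init.h>

#ifdef CONFIG_SYSCTL
static int example_knob;

static struct ctl_table example_sysctls[] = {
        {
                .procname     = "example_knob",  /* appears as /proc/sys/kernel/example_knob */
                .data         = &example_knob,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = proc_dointvec_minmax,
                .extra1       = SYSCTL_ZERO,
                .extra2       = SYSCTL_ONE,
        },
        {}      /* sentinel */
};

static void __init example_sysctl_init(void)
{
        register_sysctl_init("kernel", example_sysctls);
}
#else
#define example_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

The subsystem's existing init function then calls the stub-or-real example_sysctl_init(), exactly as hung_task_init() does above, and the knob keeps its old /proc/sys path.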
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ee595ec09778..623b8136e9af 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
+ unsigned int irq = (int)(long)pde_data(file_inode(file));
cpumask_var_t new_value;
int err;
@@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_proc_show, pde_data(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_list_proc_show, pde_data(inode));
}
static const struct proc_ops irq_affinity_proc_ops = {
@@ -265,7 +265,7 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE_DATA(inode));
+ return single_open(file, default_affinity_show, pde_data(inode));
}
static const struct proc_ops default_affinity_proc_ops = {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 21eccc961bba..94cab8c9ce56 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,6 +48,9 @@
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL)
+#define kprobe_sysctls_init() do { } while (0)
+#endif
static int kprobes_initialized;
/* kprobe_table can be accessed by
@@ -938,10 +941,10 @@ static void unoptimize_all_kprobes(void)
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
-int sysctl_kprobes_optimization;
-int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length,
- loff_t *ppos)
+static int sysctl_kprobes_optimization;
+static int proc_kprobes_optimization_handler(struct ctl_table *table,
+ int write, void *buffer,
+ size_t *length, loff_t *ppos)
{
int ret;
@@ -957,6 +960,24 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
return ret;
}
+
+static struct ctl_table kprobe_sysctls[] = {
+ {
+ .procname = "kprobes-optimization",
+ .data = &sysctl_kprobes_optimization,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_kprobes_optimization_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init kprobe_sysctls_init(void)
+{
+ register_sysctl_init("debug", kprobe_sysctls);
+}
#endif /* CONFIG_SYSCTL */
/* Put a breakpoint for a probe. */
@@ -2584,6 +2605,7 @@ static int __init init_kprobes(void)
err = register_module_notifier(&kprobe_module_nb);
kprobes_initialized = (err == 0);
+ kprobe_sysctls_init();
return err;
}
early_initcall(init_kprobes);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index b562f9289372..7f49baaa4979 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -300,6 +300,16 @@ void __lockfunc _raw_write_lock(rwlock_t *lock)
__raw_write_lock(lock);
}
EXPORT_SYMBOL(_raw_write_lock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock)))
+#endif
+
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+ __raw_write_lock_nested(lock, subclass);
+}
+EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 9e396a09fe0f..48a19ed8486d 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwlock)
}
EXPORT_SYMBOL(rt_write_lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock_nested);
+#endif
+
void __sched rt_read_unlock(rwlock_t *rwlock)
{
rwlock_release(&rwlock->dep_map, _RET_IP_);
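spinlock.c and spinlock_rt.c add _raw_write_lock_nested() and rt_write_lock_nested() so a writer can hold two rwlocks of the same lock class without lockdep reporting a false deadlock. A caller-side sketch (hypothetical structures; it assumes the usual write_lock_nested() wrapper that these primitives back, and that callers impose a stable lock ordering):

#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct bucket {
        rwlock_t lock;
        /* ... hypothetical payload ... */
};

/*
 * Both locks belong to the same lock class, so annotate the inner one as
 * nested. The caller is assumed to order src/dst consistently (for example
 * by address) so the double acquisition cannot deadlock.
 */
static void move_entry(struct bucket *src, struct bucket *dst)
{
        write_lock(&src->lock);
        write_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING);

        /* ... transfer the entry from src to dst ... */

        write_unlock(&dst->lock);
        write_unlock(&src->lock);
}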
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index d118739874c0..f5b388e810b9 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -2,5 +2,8 @@
obj-y = printk.o
obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
-obj-$(CONFIG_PRINTK) += printk_ringbuffer.o
obj-$(CONFIG_PRINTK_INDEX) += index.o
+
+obj-$(CONFIG_PRINTK) += printk_support.o
+printk_support-y := printk_ringbuffer.o
+printk_support-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 9f3ed2fdb721..d947ca6c84f9 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -4,6 +4,14 @@
*/
#include <linux/percpu.h>
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+void __init printk_sysctl_init(void);
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+#else
+#define printk_sysctl_init() do { } while (0)
+#endif
+
#ifdef CONFIG_PRINTK
/* Flags for a single printk record. */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 155229f0cf0f..82abfaf3c2aa 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -171,7 +171,7 @@ static int __init control_devkmsg(char *str)
__setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
-
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -210,6 +210,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
return 0;
}
+#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
/* Number of registered extended console drivers. */
static int nr_ext_console_drivers;
@@ -3211,6 +3212,7 @@ static int __init printk_late_init(void)
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
console_cpu_notify, NULL);
WARN_ON(ret < 0);
+ printk_sysctl_init();
return 0;
}
late_initcall(printk_late_init);
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
new file mode 100644
index 000000000000..653ae04aab7f
--- /dev/null
+++ b/kernel/printk/sysctl.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysctl.c: General linux system control interface
+ */
+
+#include <linux/sysctl.h>
+#include <linux/printk.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include "internal.h"
+
+static const int ten_thousand = 10000;
+
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table printk_sysctls[] = {
+ {
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_ratelimit",
+ .data = &printk_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "printk_ratelimit_burst",
+ .data = &printk_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_delay",
+ .data = &printk_delay_msec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&ten_thousand,
+ },
+ {
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
+ .procname = "dmesg_restrict",
+ .data = &dmesg_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {}
+};
+
+void __init printk_sysctl_init(void)
+{
+ register_sysctl_init("kernel", printk_sysctls);
+}
diff --git a/kernel/resource.c b/kernel/resource.c
index 5ad3eba619ba..9c08d6e9eef2 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -99,7 +99,7 @@ enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = PDE_DATA(file_inode(m->file));
+ struct resource *p = pde_data(file_inode(m->file));
loff_t l = 0;
read_lock(&resource_lock);
for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
@@ -115,7 +115,7 @@ static void r_stop(struct seq_file *m, void *v)
static int r_show(struct seq_file *m, void *v)
{
- struct resource *root = PDE_DATA(file_inode(m->file));
+ struct resource *root = pde_data(file_inode(m->file));
struct resource *r = v, *p;
unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e4ae00e52d1..848eaa0efe0e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5822,8 +5822,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
if (schedstat_enabled() && rq->core->core_forceidle_count) {
- if (cookie)
- rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}
@@ -8219,9 +8218,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
@@ -8239,9 +8236,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
read_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
read_lock(lock);
@@ -8259,9 +8254,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
write_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
write_lock(lock);
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 1fb45672ec85..c8746a9a7ada 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -277,7 +277,7 @@ void __sched_core_account_forceidle(struct rq *rq)
rq_i = cpu_rq(i);
p = rq_i->core_pick ?: rq_i->curr;
- if (!p->core_cookie)
+ if (p == rq_i->idle)
continue;
__schedstat_add(p->stats.core_forceidle_sum, delta);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 095b0aa378df..5146163bfabb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3028,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3381,7 +3383,6 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3449,15 +3450,14 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3466,23 +3466,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3493,19 +3500,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * divider;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
+ s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3532,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
- load_sum = div_s64(gcfs_rq->avg.load_sum,
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
@@ -3549,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
- load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, divider);
-
- se->avg.load_sum = runnable_sum;
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
- delta = load_avg - se->avg.load_avg;
- if (!delta)
+ delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
return;
- se->avg.load_avg = load_avg;
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3652,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3677,15 +3693,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
- sa->load_sum = sa->load_avg * divider;
+ sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
- sa->util_sum = sa->util_avg * divider;
+ sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
- sa->runnable_sum = sa->runnable_avg * divider;
+ sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3772,17 +3805,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = get_pelt_divider(&cfs_rq->avg);
-
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -8539,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct,
*
* If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
* of @dst_cpu are idle and @sg has lower priority.
+ *
+ * Return: true if @dst_cpu can pull tasks, false otherwise.
*/
static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sg_lb_stats *sgs,
@@ -8614,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
@@ -9421,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
+ * @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
- * @env: The load balancing environment.
- *
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index e06071bf3472..c336f5f481bc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
+#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
+
static inline u32 get_pelt_divider(struct sched_avg *avg)
{
- return LOAD_AVG_MAX - 1024 + avg->period_contrib;
+ return PELT_MIN_DIVIDER + avg->period_contrib;
}
static inline void cfs_se_util_change(struct sched_avg *avg)
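pelt.h now gives the fixed part of the PELT divider a name: get_pelt_divider() returns PELT_MIN_DIVIDER + avg->period_contrib, with PELT_MIN_DIVIDER = LOAD_AVG_MAX - 1024. That is what lets the fair.c hunks above subtract a contribution from both *_avg and *_sum and then clamp *_sum to at least *_avg * PELT_MIN_DIVIDER, rather than recomputing *_sum from *_avg and discarding the accumulated rounding. A small worked example of the clamp (plain C stand-ins for sub_positive() and max_t(); LOAD_AVG_MAX is the kernel's maximum PELT series sum):

#include <stdint.h>
#include <stdio.h>

#define LOAD_AVG_MAX     47742                     /* max sum of the PELT geometric series */
#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)     /* divider when period_contrib == 0 */

/* Subtract without wrapping below zero, like the kernel's sub_positive(). */
static uint64_t sub_positive(uint64_t val, uint64_t sub)
{
        return val > sub ? val - sub : 0;
}

int main(void)
{
        uint32_t period_contrib = 512;             /* somewhere in [0, 1024) */
        uint32_t divider = PELT_MIN_DIVIDER + period_contrib;

        uint64_t util_avg = 100, util_sum = (uint64_t)util_avg * divider;
        uint64_t removed_util = 40;                /* avg contribution being detached */

        util_avg = sub_positive(util_avg, removed_util);
        util_sum = sub_positive(util_sum, (uint64_t)removed_util * divider);

        /*
         * Rounding can leave util_sum below the smallest value consistent with
         * util_avg; keep util_sum >= util_avg * PELT_MIN_DIVIDER, as
         * update_cfs_rq_load_avg() does above.
         */
        if (util_sum < util_avg * PELT_MIN_DIVIDER)
                util_sum = util_avg * PELT_MIN_DIVIDER;

        printf("util_avg=%llu util_sum=%llu\n",
               (unsigned long long)util_avg, (unsigned long long)util_sum);
        return 0;
}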
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a679613a7cb7..c137c4d6983e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0;
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
mutex_lock(&group->trigger_lock);
@@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
+ struct psi_group *group;
struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process.
@@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref)
mutex_unlock(&group->trigger_lock);
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_task RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_task
+ * Wait for psi_schedule_poll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * poll_task.
*/
synchronize_rcu();
/*
@@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref)
kfree(t);
}
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
-}
-
__poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait)
{
@@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
-
- rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy);
-
return ret;
}
@@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
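With the kref gone from psi.c, a trigger is reachable from exactly one place (the file's private data), is published with smp_store_release(), read with smp_load_acquire(), and destroyed only from the release path where no reader can remain. The poll path then only needs a one-shot consumption of t->event: cmpxchg(&t->event, 1, 0) lets each fired event be reported once. A small sketch of that consume-once flag in C11 atomics (illustrative, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

struct trigger {
        _Atomic int event;      /* set to 1 by the poll worker when the threshold fires */
};

/* Producer side: record that the threshold was crossed since the last poll. */
static void trigger_fire(struct trigger *t)
{
        atomic_store_explicit(&t->event, 1, memory_order_release);
}

/*
 * Consumer side (shape of the cmpxchg in psi_trigger_poll()): returns true at
 * most once per fired event, so wakeups are not double-counted.
 */
static bool trigger_consume(struct trigger *t)
{
        int expected = 1;

        return atomic_compare_exchange_strong_explicit(&t->event, &expected, 0,
                                                       memory_order_acq_rel,
                                                       memory_order_acquire);
}

psi_trigger_poll() above pairs this with poll_wait() on t->event_wait so a blocked poller is woken when the flag is set.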
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index ce161a8e8d97..66b8af394e58 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -16,11 +16,13 @@
#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
#include <linux/jump_label.h>
#include <linux/sysctl.h>
+#include <linux/init.h>
static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
-int stack_erasing_sysctl(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_SYSCTL
+static int stack_erasing_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
int ret = 0;
int state = !static_branch_unlikely(&stack_erasing_bypass);
@@ -42,6 +44,26 @@ int stack_erasing_sysctl(struct ctl_table *table, int write,
state ? "enabled" : "disabled");
return ret;
}
+static struct ctl_table stackleak_sysctls[] = {
+ {
+ .procname = "stack_erasing",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = stack_erasing_sysctl,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init stackleak_sysctls_init(void)
+{
+ register_sysctl_init("kernel", stackleak_sysctls);
+ return 0;
+}
+late_initcall(stackleak_sysctls_init);
+#endif /* CONFIG_SYSCTL */
#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass)
#else
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ef77be575d87..5ae443b2882e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -20,7 +20,6 @@
*/
#include <linux/module.h>
-#include <linux/aio.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
@@ -50,7 +49,6 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
-#include <linux/dnotify.h>
#include <linux/syscalls.h>
#include <linux/vmstat.h>
#include <linux/nfs_fs.h>
@@ -58,19 +56,15 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/perf_event.h>
-#include <linux/kprobes.h>
-#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
-#include <linux/sched/coredump.h>
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
-#include <linux/coredump.h>
#include <linux/latencytop.h>
#include <linux/pid.h>
#include <linux/delayacct.h>
@@ -97,65 +91,21 @@
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
#include <linux/lockdep.h>
#endif
-#ifdef CONFIG_CHR_DEV_SG
-#include <scsi/sg.h>
-#endif
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
-#include <linux/stackleak.h>
-#endif
-#ifdef CONFIG_LOCKUP_DETECTOR
-#include <linux/nmi.h>
-#endif
#if defined(CONFIG_SYSCTL)
/* Constants used for minimum and maximum */
-#ifdef CONFIG_LOCKUP_DETECTOR
-static int sixty = 60;
-#endif
-
-static int __maybe_unused neg_one = -1;
-static int __maybe_unused two = 2;
-static int __maybe_unused four = 4;
-static unsigned long zero_ul;
-static unsigned long one_ul = 1;
-static unsigned long long_max = LONG_MAX;
-static int one_hundred = 100;
-static int two_hundred = 200;
-static int one_thousand = 1000;
-static int three_thousand = 3000;
-#ifdef CONFIG_PRINTK
-static int ten_thousand = 10000;
-#endif
+
#ifdef CONFIG_PERF_EVENTS
-static int six_hundred_forty_kb = 640 * 1024;
+static const int six_hundred_forty_kb = 640 * 1024;
#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-
-/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
-static int maxolduid = 65535;
-static int minolduid;
+static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-static int ngroups_max = NGROUPS_MAX;
+static const int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
-/*
- * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
- * and hung_task_check_interval_secs
- */
-#ifdef CONFIG_DETECT_HUNG_TASK
-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
-#endif
-
-#ifdef CONFIG_INOTIFY_USER
-#include <linux/inotify.h>
-#endif
-#ifdef CONFIG_FANOTIFY
-#include <linux/fanotify.h>
-#endif
-
#ifdef CONFIG_PROC_SYSCTL
/**
@@ -192,8 +142,8 @@ int sysctl_legacy_va_layout;
#endif
#ifdef CONFIG_COMPACTION
-static int min_extfrag_threshold;
-static int max_extfrag_threshold = 1000;
+/* min_extfrag_threshold is SYSCTL_ZERO */;
+static const int max_extfrag_threshold = 1000;
#endif
#endif /* CONFIG_SYSCTL */
@@ -804,12 +754,12 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
}
-static int do_proc_douintvec(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
+int do_proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
{
return __do_proc_douintvec(table->data, table, write,
buffer, lenp, ppos, conv, data);
@@ -938,17 +888,6 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
-#ifdef CONFIG_PRINTK
-static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-#endif
-
/**
* struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
* @min: pointer to minimum allowable value
@@ -1144,67 +1083,6 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write,
}
EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
-static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned int val;
-
- val = round_pipe_size(*lvalp);
- if (val == 0)
- return -EINVAL;
-
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
- }
-
- return 0;
-}
-
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, NULL);
-}
-
-static void validate_coredump_safety(void)
-{
-#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING
-"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
-"Pipe handler or fully qualified core dump path required.\n"
-"Set kernel.core_pattern before fs.suid_dumpable.\n"
- );
- }
-#endif
-}
-
-static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-
-#ifdef CONFIG_COREDUMP
-static int proc_dostring_coredump(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-#endif
-
#ifdef CONFIG_MAGIC_SYSRQ
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -1267,10 +1145,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
- if (err)
+ if (err || neg) {
+ err = -EINVAL;
break;
- if (neg)
- continue;
+ }
+
val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max)) {
err = -EINVAL;
@@ -1928,29 +1807,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_COREDUMP
- {
- .procname = "core_uses_pid",
- .data = &core_uses_pid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "core_pattern",
- .data = core_pattern,
- .maxlen = CORENAME_MAX_SIZE,
- .mode = 0644,
- .proc_handler = proc_dostring_coredump,
- },
- {
- .procname = "core_pipe_limit",
- .data = &core_pipe_limit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
@@ -1964,7 +1820,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &neg_one,
+ .extra1 = SYSCTL_NEG_ONE,
.extra2 = SYSCTL_ONE,
},
#endif
@@ -2131,15 +1987,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dostring,
},
#endif
-#ifdef CONFIG_CHR_DEV_SG
- {
- .procname = "sg-big-buff",
- .data = &sg_big_buff,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
{
.procname = "acct",
@@ -2175,30 +2022,18 @@ static struct ctl_table kern_table[] = {
.proc_handler = sysctl_max_threads,
},
{
- .procname = "random",
- .mode = 0555,
- .child = random_table,
- },
- {
.procname = "usermodehelper",
.mode = 0555,
.child = usermodehelper_table,
},
-#ifdef CONFIG_FW_LOADER_USER_HELPER
- {
- .procname = "firmware_config",
- .mode = 0555,
- .child = firmware_config_table,
- },
-#endif
{
.procname = "overflowuid",
.data = &overflowuid,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
},
{
.procname = "overflowgid",
@@ -2206,8 +2041,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
},
#ifdef CONFIG_S390
{
@@ -2252,66 +2087,9 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
-#if defined CONFIG_PRINTK
- {
- .procname = "printk",
- .data = &console_loglevel,
- .maxlen = 4*sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "printk_ratelimit",
- .data = &printk_ratelimit_state.interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "printk_ratelimit_burst",
- .data = &printk_ratelimit_state.burst,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "printk_delay",
- .data = &printk_delay_msec,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &ten_thousand,
- },
- {
- .procname = "printk_devkmsg",
- .data = devkmsg_log_str,
- .maxlen = DEVKMSG_STR_MAX_SIZE,
- .mode = 0644,
- .proc_handler = devkmsg_sysctl_set_loglvl,
- },
- {
- .procname = "dmesg_restrict",
- .data = &dmesg_restrict,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "kptr_restrict",
- .data = &kptr_restrict,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
-#endif
{
.procname = "ngroups_max",
- .data = &ngroups_max,
+ .data = (void *)&ngroups_max,
.maxlen = sizeof (int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -2323,96 +2101,6 @@ static struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
-#if defined(CONFIG_LOCKUP_DETECTOR)
- {
- .procname = "watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_watchdog,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "watchdog_thresh",
- .data = &watchdog_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_watchdog_thresh,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &sixty,
- },
- {
- .procname = "nmi_watchdog",
- .data = &nmi_watchdog_user_enabled,
- .maxlen = sizeof(int),
- .mode = NMI_WATCHDOG_SYSCTL_PERM,
- .proc_handler = proc_nmi_watchdog,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "watchdog_cpumask",
- .data = &watchdog_cpumask_bits,
- .maxlen = NR_CPUS,
- .mode = 0644,
- .proc_handler = proc_watchdog_cpumask,
- },
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
- {
- .procname = "soft_watchdog",
- .data = &soft_watchdog_user_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_soft_watchdog,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "softlockup_panic",
- .data = &softlockup_panic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#ifdef CONFIG_SMP
- {
- .procname = "softlockup_all_cpu_backtrace",
- .data = &sysctl_softlockup_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
-#endif
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
- {
- .procname = "hardlockup_panic",
- .data = &hardlockup_panic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#ifdef CONFIG_SMP
- {
- .procname = "hardlockup_all_cpu_backtrace",
- .data = &sysctl_hardlockup_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
-#endif
-#endif
-
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
.procname = "unknown_nmi_panic",
@@ -2515,60 +2203,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_DETECT_HUNG_TASK
-#ifdef CONFIG_SMP
- {
- .procname = "hung_task_all_cpu_backtrace",
- .data = &sysctl_hung_task_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
- {
- .procname = "hung_task_panic",
- .data = &sysctl_hung_task_panic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "hung_task_check_count",
- .data = &sysctl_hung_task_check_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "hung_task_timeout_secs",
- .data = &sysctl_hung_task_timeout_secs,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_dohung_task_timeout_secs,
- .extra2 = &hung_task_timeout_max,
- },
- {
- .procname = "hung_task_check_interval_secs",
- .data = &sysctl_hung_task_check_interval_secs,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_dohung_task_timeout_secs,
- .extra2 = &hung_task_timeout_max,
- },
- {
- .procname = "hung_task_warnings",
- .data = &sysctl_hung_task_warnings,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &neg_one,
- },
-#endif
#ifdef CONFIG_RT_MUTEXES
{
.procname = "max_lock_depth",
@@ -2628,7 +2262,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = perf_cpu_time_max_percent_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "perf_event_max_stack",
@@ -2637,7 +2271,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &six_hundred_forty_kb,
+ .extra2 = (void *)&six_hundred_forty_kb,
},
{
.procname = "perf_event_max_contexts_per_stack",
@@ -2646,7 +2280,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_thousand,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
#endif
{
@@ -2677,7 +2311,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = bpf_unpriv_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "bpf_stats_enabled",
@@ -2709,17 +2343,6 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_INT_MAX,
},
#endif
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
- {
- .procname = "stack_erasing",
- .data = NULL,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = stack_erasing_sysctl,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
{ }
};
@@ -2731,7 +2354,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = overcommit_policy_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "panic_on_oom",
@@ -2740,7 +2363,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "oom_kill_allocating_task",
@@ -2785,7 +2408,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = dirty_background_ratio_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "dirty_background_bytes",
@@ -2793,7 +2416,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(dirty_background_bytes),
.mode = 0644,
.proc_handler = dirty_background_bytes_handler,
- .extra1 = &one_ul,
+ .extra1 = SYSCTL_LONG_ONE,
},
{
.procname = "dirty_ratio",
@@ -2802,7 +2425,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = dirty_ratio_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "dirty_bytes",
@@ -2810,7 +2433,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(vm_dirty_bytes),
.mode = 0644,
.proc_handler = dirty_bytes_handler,
- .extra1 = &dirty_bytes_min,
+ .extra1 = (void *)&dirty_bytes_min,
},
{
.procname = "dirty_writeback_centisecs",
@@ -2842,7 +2465,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two_hundred,
+ .extra2 = SYSCTL_TWO_HUNDRED,
},
#ifdef CONFIG_HUGETLB_PAGE
{
@@ -2899,7 +2522,7 @@ static struct ctl_table vm_table[] = {
.mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = SYSCTL_ONE,
- .extra2 = &four,
+ .extra2 = SYSCTL_FOUR,
},
#ifdef CONFIG_COMPACTION
{
@@ -2916,7 +2539,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = compaction_proactiveness_sysctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "extfrag_threshold",
@@ -2924,8 +2547,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &min_extfrag_threshold,
- .extra2 = &max_extfrag_threshold,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&max_extfrag_threshold,
},
{
.procname = "compact_unevictable_allowed",
@@ -2961,7 +2584,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = watermark_scale_factor_sysctl_handler,
.extra1 = SYSCTL_ONE,
- .extra2 = &three_thousand,
+ .extra2 = SYSCTL_THREE_THOUSAND,
},
{
.procname = "percpu_pagelist_high_fraction",
@@ -3040,7 +2663,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "min_slab_ratio",
@@ -3049,7 +2672,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = sysctl_min_slab_ratio_sysctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = SYSCTL_ONE_HUNDRED,
},
#endif
#ifdef CONFIG_SMP
@@ -3183,221 +2806,6 @@ static struct ctl_table vm_table[] = {
{ }
};
-static struct ctl_table fs_table[] = {
- {
- .procname = "inode-nr",
- .data = &inodes_stat,
- .maxlen = 2*sizeof(long),
- .mode = 0444,
- .proc_handler = proc_nr_inodes,
- },
- {
- .procname = "inode-state",
- .data = &inodes_stat,
- .maxlen = 7*sizeof(long),
- .mode = 0444,
- .proc_handler = proc_nr_inodes,
- },
- {
- .procname = "file-nr",
- .data = &files_stat,
- .maxlen = sizeof(files_stat),
- .mode = 0444,
- .proc_handler = proc_nr_files,
- },
- {
- .procname = "file-max",
- .data = &files_stat.max_files,
- .maxlen = sizeof(files_stat.max_files),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &zero_ul,
- .extra2 = &long_max,
- },
- {
- .procname = "nr_open",
- .data = &sysctl_nr_open,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &sysctl_nr_open_min,
- .extra2 = &sysctl_nr_open_max,
- },
- {
- .procname = "dentry-state",
- .data = &dentry_stat,
- .maxlen = 6*sizeof(long),
- .mode = 0444,
- .proc_handler = proc_nr_dentry,
- },
- {
- .procname = "overflowuid",
- .data = &fs_overflowuid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
- },
- {
- .procname = "overflowgid",
- .data = &fs_overflowgid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
- },
-#ifdef CONFIG_FILE_LOCKING
- {
- .procname = "leases-enable",
- .data = &leases_enable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_DNOTIFY
- {
- .procname = "dir-notify-enable",
- .data = &dir_notify_enable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_MMU
-#ifdef CONFIG_FILE_LOCKING
- {
- .procname = "lease-break-time",
- .data = &lease_break_time,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_AIO
- {
- .procname = "aio-nr",
- .data = &aio_nr,
- .maxlen = sizeof(aio_nr),
- .mode = 0444,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "aio-max-nr",
- .data = &aio_max_nr,
- .maxlen = sizeof(aio_max_nr),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
-#endif /* CONFIG_AIO */
-#ifdef CONFIG_INOTIFY_USER
- {
- .procname = "inotify",
- .mode = 0555,
- .child = inotify_table,
- },
-#endif
-#ifdef CONFIG_FANOTIFY
- {
- .procname = "fanotify",
- .mode = 0555,
- .child = fanotify_table,
- },
-#endif
-#ifdef CONFIG_EPOLL
- {
- .procname = "epoll",
- .mode = 0555,
- .child = epoll_table,
- },
-#endif
-#endif
- {
- .procname = "protected_symlinks",
- .data = &sysctl_protected_symlinks,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "protected_hardlinks",
- .data = &sysctl_protected_hardlinks,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "protected_fifos",
- .data = &sysctl_protected_fifos,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
- {
- .procname = "protected_regular",
- .data = &sysctl_protected_regular,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
- {
- .procname = "suid_dumpable",
- .data = &suid_dumpable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_coredump,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
-#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
- {
- .procname = "binfmt_misc",
- .mode = 0555,
- .child = sysctl_mount_point,
- },
-#endif
- {
- .procname = "pipe-max-size",
- .data = &pipe_max_size,
- .maxlen = sizeof(pipe_max_size),
- .mode = 0644,
- .proc_handler = proc_dopipe_max_size,
- },
- {
- .procname = "pipe-user-pages-hard",
- .data = &pipe_user_pages_hard,
- .maxlen = sizeof(pipe_user_pages_hard),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "pipe-user-pages-soft",
- .data = &pipe_user_pages_soft,
- .maxlen = sizeof(pipe_user_pages_soft),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "mount-max",
- .data = &sysctl_mount_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- },
- { }
-};
-
static struct ctl_table debug_table[] = {
#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
{
@@ -3408,17 +2816,6 @@ static struct ctl_table debug_table[] = {
.proc_handler = proc_dointvec
},
#endif
-#if defined(CONFIG_OPTPROBES)
- {
- .procname = "kprobes-optimization",
- .data = &sysctl_kprobes_optimization,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_kprobes_optimization_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
{ }
};
@@ -3426,41 +2823,18 @@ static struct ctl_table dev_table[] = {
{ }
};
-static struct ctl_table sysctl_base_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = kern_table,
- },
- {
- .procname = "vm",
- .mode = 0555,
- .child = vm_table,
- },
- {
- .procname = "fs",
- .mode = 0555,
- .child = fs_table,
- },
- {
- .procname = "debug",
- .mode = 0555,
- .child = debug_table,
- },
- {
- .procname = "dev",
- .mode = 0555,
- .child = dev_table,
- },
- { }
-};
+DECLARE_SYSCTL_BASE(kernel, kern_table);
+DECLARE_SYSCTL_BASE(vm, vm_table);
+DECLARE_SYSCTL_BASE(debug, debug_table);
+DECLARE_SYSCTL_BASE(dev, dev_table);
-int __init sysctl_init(void)
+int __init sysctl_init_bases(void)
{
- struct ctl_table_header *hdr;
+ register_sysctl_base(kernel);
+ register_sysctl_base(vm);
+ register_sysctl_base(debug);
+ register_sysctl_base(dev);
- hdr = register_sysctl_table(sysctl_base_table);
- kmemleak_not_leak(hdr);
return 0;
}
#endif /* CONFIG_SYSCTL */
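
The kernel/sysctl.c hunks above shrink the monolithic kern_table/vm_table, drop fs_table entirely, and replace the hand-rolled sysctl_base_table with DECLARE_SYSCTL_BASE()/register_sysctl_base(). Entries removed here (watchdog, hung_task, stack_erasing, kprobes-optimization, the fs knobs) are re-registered from their owning subsystems; the kernel/watchdog.c hunk at the end of this diff shows one such case. A minimal sketch of that per-subsystem pattern, using a hypothetical tunable and table name:

/*
 * Hypothetical sketch only: one int knob owned by its subsystem and
 * registered under "kernel" with register_sysctl_init(), the helper the
 * relocated tables in this series use.
 */
#include <linux/sysctl.h>
#include <linux/init.h>

static int example_knob;	/* hypothetical tunable */

static struct ctl_table example_sysctls[] = {
	{
		.procname	= "example_knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

static int __init example_sysctl_init(void)
{
	/* Shows up as /proc/sys/kernel/example_knob. */
	register_sysctl_init("kernel", example_sysctls);
	return 0;
}
late_initcall(example_sysctl_init);
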
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b7e52a642948..1cf73807b450 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -285,7 +285,7 @@ static void clocksource_verify_choose_cpus(void)
return;
/* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_next(-1, cpu_online_mask);
+ cpu = cpumask_first(cpu_online_mask);
if (cpu == smp_processor_id())
cpu = cpumask_next(cpu, cpu_online_mask);
if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
@@ -307,7 +307,7 @@ static void clocksource_verify_choose_cpus(void)
cpu = prandom_u32() % nr_cpu_ids;
cpu = cpumask_next(cpu - 1, cpu_online_mask);
if (cpu >= nr_cpu_ids)
- cpu = cpumask_next(-1, cpu_online_mask);
+ cpu = cpumask_first(cpu_online_mask);
if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
cpumask_set_cpu(cpu, &cpus_chosen);
}
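
In the clocksource_verify_choose_cpus() hunks above, cpumask_next(-1, cpu_online_mask) becomes cpumask_first(cpu_online_mask). Both return the lowest-numbered online CPU; the rewrite simply names the intent instead of starting a "next" search from bit 0. An illustrative fragment, not part of the patch:

	/* Both pick the lowest set bit in the mask; cpumask_first() says so directly. */
	unsigned int cpu = cpumask_first(cpu_online_mask);
	/* equivalent to: cpu = cpumask_next(-1, cpu_online_mask); */
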
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f468767bc287..752ed89a293b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -70,6 +70,13 @@ config HAVE_C_RECORDMCOUNT
help
C version of recordmcount available?
+config BUILDTIME_MCOUNT_SORT
+ bool
+ default y
+ depends on BUILDTIME_TABLE_SORT && !S390
+ help
+ Sort the mcount_loc section at build time.
+
config TRACER_MAX_TRACE
bool
@@ -918,7 +925,7 @@ config EVENT_TRACE_TEST_SYSCALLS
config FTRACE_SORT_STARTUP_TEST
bool "Verify compile time sorting of ftrace functions"
depends on DYNAMIC_FTRACE
- depends on BUILDTIME_TABLE_SORT
+ depends on BUILDTIME_MCOUNT_SORT
help
	  Sorting of the mcount_loc section, which ftrace uses to find
	  where to patch functions for tracing
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6163b6f762f7..f9feb197b2da 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6435,10 +6435,10 @@ static int ftrace_process_locs(struct module *mod,
/*
	 * Sorting mcount in vmlinux at build time depends on
- * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in
+ * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in
* modules can not be sorted at build time.
*/
- if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) {
+ if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) {
sort(start, count, sizeof(*start),
ftrace_cmp_ips, NULL);
} else {
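
The Kconfig and ftrace.c hunks above introduce CONFIG_BUILDTIME_MCOUNT_SORT (build-time sorting of mcount_loc, gated on BUILDTIME_TABLE_SORT and unavailable on s390) and key the runtime fallback off it with IS_ENABLED(): vmlinux may arrive pre-sorted, module mcount tables never do. A small illustration of that idiom, with a made-up option and helper:

/*
 * Illustrative only (CONFIG_FOO and maybe_sort() are made up):
 * IS_ENABLED(CONFIG_FOO) expands to a compile-time 0 or 1, so the
 * compiler drops the unused branch.
 */
#include <linux/kconfig.h>
#include <linux/sort.h>

static void maybe_sort(unsigned long *start, size_t count,
		       int (*cmp)(const void *, const void *), bool is_module)
{
	if (!IS_ENABLED(CONFIG_FOO) || is_module)
		sort(start, count, sizeof(*start), cmp, NULL);
}
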
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index ad912511a0c0..99afb88d2e85 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -740,6 +740,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
mutex_unlock(&watchdog_mutex);
return err;
}
+
+static const int sixty = 60;
+
+static struct ctl_table watchdog_sysctls[] = {
+ {
+ .procname = "watchdog",
+ .data = &watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_watchdog,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "watchdog_thresh",
+ .data = &watchdog_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_watchdog_thresh,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&sixty,
+ },
+ {
+ .procname = "nmi_watchdog",
+ .data = &nmi_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = NMI_WATCHDOG_SYSCTL_PERM,
+ .proc_handler = proc_nmi_watchdog,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "watchdog_cpumask",
+ .data = &watchdog_cpumask_bits,
+ .maxlen = NR_CPUS,
+ .mode = 0644,
+ .proc_handler = proc_watchdog_cpumask,
+ },
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+ {
+ .procname = "soft_watchdog",
+ .data = &soft_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_soft_watchdog,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "softlockup_panic",
+ .data = &softlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#ifdef CONFIG_SMP
+ {
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+ {
+ .procname = "hardlockup_panic",
+ .data = &hardlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#ifdef CONFIG_SMP
+ {
+ .procname = "hardlockup_all_cpu_backtrace",
+ .data = &sysctl_hardlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+#endif
+ {}
+};
+
+static void __init watchdog_sysctl_init(void)
+{
+ register_sysctl_init("kernel", watchdog_sysctls);
+}
+#else
+#define watchdog_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
@@ -753,4 +853,5 @@ void __init lockup_detector_init(void)
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
lockup_detector_setup();
+ watchdog_sysctl_init();
}
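
The watchdog hunk is the consumer side of the kern_table trim earlier in this diff: the same entries now live in kernel/watchdog.c, are registered under "kernel" from lockup_detector_init(), and keep their /proc/sys/kernel/* paths. The CONFIG_SYSCTL on/off stub leaves the caller free of #ifdefs; sketched below with hypothetical names:

/* Sketch of the stub pattern, with a hypothetical foo_sysctl_init()/foo_sysctls. */
#ifdef CONFIG_SYSCTL
static void __init foo_sysctl_init(void)
{
	register_sysctl_init("kernel", foo_sysctls);
}
#else
#define foo_sysctl_init() do { } while (0)	/* no-op when sysctl support is off */
#endif
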