summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup/cgroup-v1.c14
-rw-r--r--kernel/cgroup/cpuset.c2
-rw-r--r--kernel/cgroup/rstat.c195
-rw-r--r--kernel/events/uprobes.c5
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/irq/chip.c8
-rw-r--r--kernel/kexec_handover.c4
-rw-r--r--kernel/padata.c154
-rw-r--r--kernel/power/hibernate.c26
-rw-r--r--kernel/rcu/tree_stall.h7
-rw-r--r--kernel/sched/core.c29
-rw-r--r--kernel/sched/ext.c250
-rw-r--r--kernel/sched/ext.h20
-rw-r--r--kernel/sched/ext_idle.c45
-rw-r--r--kernel/sched/ext_idle.h12
-rw-r--r--kernel/sched/sched.h9
-rw-r--r--kernel/sys.c50
-rw-r--r--kernel/time/clocksource.c5
-rw-r--r--kernel/trace/Kconfig27
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/preemptirq_delay_test.c13
-rw-r--r--kernel/trace/rv/rv.c6
-rw-r--r--kernel/trace/trace.c49
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_events.c154
-rw-r--r--kernel/trace/trace_events_filter.c28
-rw-r--r--kernel/trace/trace_hwlat.c5
-rw-r--r--kernel/unwind/Makefile1
-rw-r--r--kernel/unwind/deferred.c362
-rw-r--r--kernel/unwind/user.c128
-rw-r--r--kernel/vhost_task.c2
-rw-r--r--kernel/watchdog_buddy.c5
-rw-r--r--kernel/workqueue.c74
35 files changed, 1097 insertions, 605 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a89602a2a0ab..366987d9914a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
obj-y += entry/
+obj-y += unwind/
obj-$(CONFIG_MODULES) += module/
obj-$(CONFIG_KCMP) += kcmp.o
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index fa24c032ed6f..2a4a387f867a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*/
for_each_subsys(ss, i) {
- if (cgroup1_subsys_absent(ss))
- continue;
cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+ if (!proc_show_all && cgroup1_subsys_absent(ss))
+ continue;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
@@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str)
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+ return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3bc4301466f3..f74d04429a29 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4051,7 +4051,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index cbeaa499a96a..981e2f77ad4e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -10,7 +10,7 @@
#include <trace/events/cgroup.h>
static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -45,84 +45,11 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
return &rstat_base_lock;
}
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
{
- if (ss) {
- /*
- * Depending on config, the subsystem per-cpu lock type may be an
- * empty struct. In enviromnents where this is the case, allocation
- * of this field is not performed in ss_rstat_init(). Avoid a
- * cpu-based offset relative to NULL by returning early. When the
- * lock type is zero in size, the corresponding lock functions are
- * no-ops so passing them NULL is acceptable.
- */
- if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
- return NULL;
-
- return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
- }
-
- return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
- const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
- unsigned long flags;
- bool contended;
-
- /*
- * The _irqsave() is needed because the locks used for flushing are
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
- * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
- * kernel. The raw_spinlock_t below disables interrupts on both
- * configurations. The _irqsave() ensures that interrupts are always
- * disabled and later restored.
- */
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
- if (contended) {
- if (fast_path)
- trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
- raw_spin_lock_irqsave(cpu_lock, flags);
- }
-
- if (fast_path)
- trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
- return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
- unsigned long flags, const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
-
- if (fast_path)
- trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
- else
- trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
}
/**
@@ -130,13 +57,22 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
* @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
*/
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
- unsigned long flags;
+ struct llist_head *lhead;
+ struct css_rstat_cpu *rstatc;
+ struct css_rstat_cpu __percpu *rstatc_pcpu;
+ struct llist_node *self;
/*
* Since bpf programs can call this function, prevent access to
@@ -145,19 +81,49 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
if (!css_uses_rstat(css))
return;
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
+ * the requests from nmi context.
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ return;
+
+ rstatc = css_rstat_cpu(css, cpu);
/*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
+ * If already on list return. This check is racy and smp_mb() is needed
+ * to pair it with the smp_mb() in css_process_update_tree() if the
+ * guarantee that the updated stats are visible to concurrent flusher is
+ * needed.
+ */
+ if (llist_on_list(&rstatc->lnode))
+ return;
+
+ /*
+ * This function can be renentered by irqs and nmis for the same cgroup
+ * and may try to insert the same per-cpu lnode into the llist. Note
+ * that llist_add() does not protect against such scenarios.
*
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @css is on the list by
- * testing the next pointer for NULL.
+ * To protect against such stacked contexts of irqs/nmis, we use the
+ * fact that lnode points to itself when not on a list and then use
+ * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+ * which will call llist_add(). The losers can assume the insertion is
+ * successful and the winner will eventually add the per-cpu lnode to
+ * the llist.
*/
- if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+ self = &rstatc->lnode;
+ rstatc_pcpu = css->rstat_cpu;
+ if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
return;
- flags = _css_rstat_cpu_lock(css, cpu, true);
+ lhead = ss_lhead_cpu(css->ss, cpu);
+ llist_add(&rstatc->lnode, lhead);
+}
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
/* put @css and all ancestors on the corresponding updated lists */
while (true) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -183,8 +149,34 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
css = parent;
}
+}
+
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ /*
+ * smp_mb() is needed here (more specifically in between
+ * init_llist_node() and per-cpu stats flushing) if the
+ * guarantee is required by a rstat user where etiher the
+ * updater should add itself on the lockless list or the
+ * flusher flush the stats updated by the updater who have
+ * observed that they are already on the list. The
+ * corresponding barrier pair for this one should be before
+ * css_rstat_updated() by the user.
+ *
+ * For now, there aren't any such user, so not adding the
+ * barrier here but if such a use-case arise, please add
+ * smp_mb() here.
+ */
- _css_rstat_cpu_unlock(css, cpu, flags, true);
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_process_update_tree(rstatc->owner, cpu);
+ }
}
/**
@@ -288,13 +280,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
{
struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
struct cgroup_subsys_state *head = NULL, *parent, *child;
- unsigned long flags;
- flags = _css_rstat_cpu_lock(root, cpu, false);
+ css_process_update_tree(root->ss, cpu);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
- goto unlock_ret;
+ return NULL;
/*
* Unlink @root from its parent. As the updated_children list is
@@ -326,8 +317,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
rstatc->updated_children = root;
if (child != root)
head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
- _css_rstat_cpu_unlock(root, cpu, flags, false);
+
return head;
}
@@ -468,7 +458,8 @@ int css_rstat_init(struct cgroup_subsys_state *css)
for_each_possible_cpu(cpu) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- rstatc->updated_children = css;
+ rstatc->owner = rstatc->updated_children = css;
+ init_llist_node(&rstatc->lnode);
if (is_self) {
struct cgroup_rstat_base_cpu *rstatbc;
@@ -522,19 +513,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
{
int cpu;
- /*
- * Depending on config, the subsystem per-cpu lock type may be an empty
- * struct. Avoid allocating a size of zero in this case.
- */
- if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
- ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
- if (!ss->rstat_ss_cpu_lock)
+ if (ss) {
+ ss->lhead = alloc_percpu(struct llist_head);
+ if (!ss->lhead)
return -ENOMEM;
}
spin_lock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
- raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+ init_llist_head(ss_lhead_cpu(ss, cpu));
return 0;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4c965ba77f9f..f774367c8e71 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -436,8 +436,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
* there are no unexpected folio references ...
*/
if (is_register || userfaultfd_missing(vma) ||
- (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
- folio_test_swapcache(folio) * folio_nr_pages(folio)))
+ (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1))
goto remap;
/*
@@ -540,7 +539,7 @@ retry:
}
ret = 0;
- if (unlikely(!folio_test_anon(folio))) {
+ if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) {
VM_WARN_ON_ONCE(is_register);
folio_put(folio);
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index bb184a67ac73..1d8c8ac33c4f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
+#include <linux/unwind_deferred.h>
#include <linux/uaccess.h>
#include <linux/pidfs.h>
@@ -938,6 +939,7 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ unwind_deferred_task_exit(tsk);
trace_sched_process_exit(tsk, group_dead);
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 52901fe4a3c2..e45354cc7cac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <linux/tick.h>
+#include <linux/unwind_deferred.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -732,6 +733,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
@@ -2135,6 +2137,8 @@ __latent_entropy struct task_struct *copy_process(
p->bpf_ctx = NULL;
#endif
+ unwind_task_init(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 624106e886ad..0d0276378c70 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -611,7 +611,13 @@ void handle_simple_irq(struct irq_desc *desc)
{
guard(raw_spinlock)(&desc->lock);
- if (!irq_can_handle(desc))
+ if (!irq_can_handle_pm(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
+ return;
+ }
+
+ if (!irq_can_handle_actions(desc))
return;
kstat_incr_irqs_this_cpu(desc);
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index d3b13a909913..e49743ae52c5 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -1100,8 +1100,8 @@ static void __init kho_release_scratch(void)
ulong pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
- set_pageblock_migratetype(pfn_to_page(pfn),
- MIGRATE_CMA);
+ init_pageblock_migratetype(pfn_to_page(pfn),
+ MIGRATE_CMA, false);
}
}
diff --git a/kernel/padata.c b/kernel/padata.c
index 7eee94166357..f85f8bd788d0 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -63,17 +63,6 @@ static inline void padata_put_pd(struct parallel_data *pd)
padata_put_pd_cnt(pd, 1);
}
-static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
-{
- int cpu, target_cpu;
-
- target_cpu = cpumask_first(pd->cpumask.pcpu);
- for (cpu = 0; cpu < cpu_index; cpu++)
- target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
-
- return target_cpu;
-}
-
static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
{
/*
@@ -82,7 +71,7 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
*/
int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
- return padata_index_to_cpu(pd, cpu_index);
+ return cpumask_nth(cpu_index, pd->cpumask.pcpu);
}
static struct padata_work *padata_work_alloc(void)
@@ -192,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
struct padata_instance *pinst = ps->pinst;
- int i, cpu, cpu_index, err;
struct parallel_data *pd;
struct padata_work *pw;
+ int cpu_index, err;
rcu_read_lock_bh();
@@ -210,12 +199,7 @@ int padata_do_parallel(struct padata_shell *ps,
/* Select an alternate fallback CPU and notify the caller. */
cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu);
-
- cpu = cpumask_first(pd->cpumask.cbcpu);
- for (i = 0; i < cpu_index; i++)
- cpu = cpumask_next(cpu, pd->cpumask.cbcpu);
-
- *cb_cpu = cpu;
+ *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu);
}
err = -EBUSY;
@@ -261,20 +245,17 @@ EXPORT_SYMBOL(padata_do_parallel);
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
*/
-static struct padata_priv *padata_find_next(struct parallel_data *pd,
- bool remove_object)
+static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu,
+ unsigned int processed)
{
struct padata_priv *padata;
struct padata_list *reorder;
- int cpu = pd->cpu;
reorder = per_cpu_ptr(pd->reorder_list, cpu);
spin_lock(&reorder->lock);
- if (list_empty(&reorder->list)) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
+ if (list_empty(&reorder->list))
+ goto notfound;
padata = list_entry(reorder->list.next, struct padata_priv, list);
@@ -282,97 +263,52 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
* Checks the rare case where two or more parallel jobs have hashed to
* the same CPU and one of the later ones finishes first.
*/
- if (padata->seq_nr != pd->processed) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
-
- if (remove_object) {
- list_del_init(&padata->list);
- ++pd->processed;
- pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
- }
+ if (padata->seq_nr != processed)
+ goto notfound;
+ list_del_init(&padata->list);
spin_unlock(&reorder->lock);
return padata;
+
+notfound:
+ pd->processed = processed;
+ pd->cpu = cpu;
+ spin_unlock(&reorder->lock);
+ return NULL;
}
-static void padata_reorder(struct parallel_data *pd)
+static void padata_reorder(struct padata_priv *padata)
{
+ struct parallel_data *pd = padata->pd;
struct padata_instance *pinst = pd->ps->pinst;
- int cb_cpu;
- struct padata_priv *padata;
- struct padata_serial_queue *squeue;
- struct padata_list *reorder;
+ unsigned int processed;
+ int cpu;
- /*
- * We need to ensure that only one cpu can work on dequeueing of
- * the reorder queue the time. Calculating in which percpu reorder
- * queue the next object will arrive takes some time. A spinlock
- * would be highly contended. Also it is not clear in which order
- * the objects arrive to the reorder queues. So a cpu could wait to
- * get the lock just to notice that there is nothing to do at the
- * moment. Therefore we use a trylock and let the holder of the lock
- * care for all the objects enqueued during the holdtime of the lock.
- */
- if (!spin_trylock_bh(&pd->lock))
- return;
+ processed = pd->processed;
+ cpu = pd->cpu;
- while (1) {
- padata = padata_find_next(pd, true);
+ do {
+ struct padata_serial_queue *squeue;
+ int cb_cpu;
- /*
- * If the next object that needs serialization is parallel
- * processed by another cpu and is still on it's way to the
- * cpu's reorder queue, nothing to do for now.
- */
- if (!padata)
- break;
+ cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
+ processed++;
cb_cpu = padata->cb_cpu;
squeue = per_cpu_ptr(pd->squeue, cb_cpu);
spin_lock(&squeue->serial.lock);
list_add_tail(&padata->list, &squeue->serial.list);
- spin_unlock(&squeue->serial.lock);
-
queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
- }
-
- spin_unlock_bh(&pd->lock);
-
- /*
- * The next object that needs serialization might have arrived to
- * the reorder queues in the meantime.
- *
- * Ensure reorder queue is read after pd->lock is dropped so we see
- * new objects from another task in padata_do_serial. Pairs with
- * smp_mb in padata_do_serial.
- */
- smp_mb();
- reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
- if (!list_empty(&reorder->list) && padata_find_next(pd, false)) {
/*
- * Other context(eg. the padata_serial_worker) can finish the request.
- * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on it's way to the
+ * cpu's reorder queue, end the loop.
*/
- padata_get_pd(pd);
- if (!queue_work(pinst->serial_wq, &pd->reorder_work))
- padata_put_pd(pd);
- }
-}
-
-static void invoke_padata_reorder(struct work_struct *work)
-{
- struct parallel_data *pd;
-
- local_bh_disable();
- pd = container_of(work, struct parallel_data, reorder_work);
- padata_reorder(pd);
- local_bh_enable();
- /* Pairs with putting the reorder_work in the serial_wq */
- padata_put_pd(pd);
+ padata = padata_find_next(pd, cpu, processed);
+ spin_unlock(&squeue->serial.lock);
+ } while (padata);
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -423,6 +359,7 @@ void padata_do_serial(struct padata_priv *padata)
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
struct list_head *pos;
+ bool gotit = true;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
@@ -432,17 +369,14 @@ void padata_do_serial(struct padata_priv *padata)
if ((signed int)(cur->seq_nr - padata->seq_nr) < 0)
break;
}
- list_add(&padata->list, pos);
+ if (padata->seq_nr != pd->processed) {
+ gotit = false;
+ list_add(&padata->list, pos);
+ }
spin_unlock(&reorder->lock);
- /*
- * Ensure the addition to the reorder list is ordered correctly
- * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
- * in padata_reorder.
- */
- smp_mb();
-
- padata_reorder(pd);
+ if (gotit)
+ padata_reorder(padata);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -632,9 +566,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_squeues(pd);
pd->seq_nr = -1;
refcount_set(&pd->refcnt, 1);
- spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
- INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
@@ -1144,12 +1076,6 @@ void padata_free_shell(struct padata_shell *ps)
if (!ps)
return;
- /*
- * Wait for all _do_serial calls to finish to avoid touching
- * freed pd's and ps's.
- */
- synchronize_rcu();
-
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
pd = rcu_dereference_protected(ps->pd, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 9216e3b91d3b..1f1f30cca573 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -381,6 +381,23 @@ static int create_image(int platform_mode)
return error;
}
+static void shrink_shmem_memory(void)
+{
+ struct sysinfo info;
+ unsigned long nr_shmem_pages, nr_freed_pages;
+
+ si_meminfo(&info);
+ nr_shmem_pages = info.sharedram; /* current page count used for shmem */
+ /*
+ * The intent is to reclaim all shmem pages. Though shrink_all_memory() can
+ * only reclaim about half of them, it's enough for creating the hibernation
+ * image.
+ */
+ nr_freed_pages = shrink_all_memory(nr_shmem_pages);
+ pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n",
+ nr_shmem_pages, nr_freed_pages);
+}
+
/**
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
@@ -422,6 +439,15 @@ int hibernation_snapshot(int platform_mode)
goto Thaw;
}
+ /*
+ * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem
+ * pages will use lots of system memory, causing hibernation image creation
+ * fail due to insufficient free memory.
+ * This call is to force flush the shmem pages to swap disk and reclaim
+ * the system memory so that image creation can succeed.
+ */
+ shrink_shmem_memory();
+
console_suspend_all();
error = dpm_suspend(PMSG_FREEZE);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2a5a5329322d..d16afeb11506 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,6 +163,13 @@ static void panic_on_rcu_stall(void)
{
static int cpu_stall;
+ /*
+ * Attempt to kick out the BPF scheduler if it's installed and defer
+ * the panic to give the system a chance to recover.
+ */
+ if (scx_rcu_cpu_stall())
+ return;
+
if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
return;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 66d93a872968..be00629f0ba4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9815,7 +9815,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
/* More than 203 days if BW_SHIFT equals 20. */
@@ -9824,12 +9826,21 @@ static const u64 max_bw_runtime_us = MAX_BW;
static void tg_bandwidth(struct task_group *tg,
u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
{
+#ifdef CONFIG_CFS_BANDWIDTH
if (period_us_p)
*period_us_p = tg_get_cfs_period(tg);
if (quota_us_p)
*quota_us_p = tg_get_cfs_quota(tg);
if (burst_us_p)
*burst_us_p = tg_get_cfs_burst(tg);
+#else /* !CONFIG_CFS_BANDWIDTH */
+ if (period_us_p)
+ *period_us_p = tg->scx.bw_period_us;
+ if (quota_us_p)
+ *quota_us_p = tg->scx.bw_quota_us;
+ if (burst_us_p)
+ *burst_us_p = tg->scx.bw_burst_us;
+#endif /* CONFIG_CFS_BANDWIDTH */
}
static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
@@ -9845,6 +9856,7 @@ static int tg_set_bandwidth(struct task_group *tg,
u64 period_us, u64 quota_us, u64 burst_us)
{
const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+ int ret = 0;
if (tg == &root_task_group)
return -EINVAL;
@@ -9882,7 +9894,12 @@ static int tg_set_bandwidth(struct task_group *tg,
burst_us + quota_us > max_bw_runtime_us))
return -EINVAL;
- return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#ifdef CONFIG_CFS_BANDWIDTH
+ ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#endif /* CONFIG_CFS_BANDWIDTH */
+ if (!ret)
+ scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
+ return ret;
}
static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
@@ -9935,7 +9952,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
tg_bandwidth(tg, &period_us, &quota_us, NULL);
return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
}
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9995,7 +10012,7 @@ static struct cftype cpu_legacy_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
.name = "cfs_period_us",
.read_u64 = cpu_period_read_u64,
@@ -10011,6 +10028,8 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_burst_read_u64,
.write_u64 = cpu_burst_write_u64,
},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
{
.name = "stat",
.seq_show = cpu_cfs_stat_show,
@@ -10224,7 +10243,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
return 0;
}
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
@@ -10271,7 +10290,7 @@ static struct cftype cpu_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7dd5cbcb7a06..7dedc9a16281 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
struct scx_cgroup_init_args {
/* the weight of the cgroup [1..10000] */
u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
};
enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
- * Update @tg's weight to @weight.
+ * Update @cgrp's weight to @weight.
*/
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+ * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+ * interpretation of @period_us and burstiness is upto to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
#endif /* CONFIG_EXT_GROUP_SCHED */
/*
@@ -884,7 +911,7 @@ enum scx_enq_flags {
/*
* The task being enqueued was previously enqueued on the current CPU's
* %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
* invoked in a ->cpu_release() callback, and the task is again
* dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
* task will not be scheduled on the CPU until at least the next invocation
@@ -1247,7 +1274,7 @@ static void scx_kf_disallow(u32 mask)
* This allows kfuncs to safely operate on rq from any scx ops callback,
* knowing which rq is already locked.
*/
-static DEFINE_PER_CPU(struct rq *, locked_rq);
+DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);
static inline void update_locked_rq(struct rq *rq)
{
@@ -1258,16 +1285,7 @@ static inline void update_locked_rq(struct rq *rq)
*/
if (rq)
lockdep_assert_rq_held(rq);
- __this_cpu_write(locked_rq, rq);
-}
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(locked_rq);
+ __this_cpu_write(scx_locked_rq_state, rq);
}
#define SCX_CALL_OP(sch, mask, op, rq, args...) \
@@ -1641,7 +1659,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* scx_add_event - Increase an event counter for 'name' by 'cnt'
* @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of the event occurred
*
* This can be used when preemption is not disabled.
*/
@@ -1654,7 +1672,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
* @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of the event occurred
*
* This should be used only when preemption is disabled.
*/
@@ -1705,11 +1723,6 @@ static bool scx_tryset_enable_state(enum scx_enable_state to,
return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
-static bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
/**
* wait_ops_state - Busy-wait the specified ops state to end
* @p: target task
@@ -1796,12 +1809,10 @@ static void run_deferred(struct rq *rq)
process_ddsp_deferred_locals(rq);
}
-#ifdef CONFIG_SMP
static void deferred_bal_cb_workfn(struct rq *rq)
{
run_deferred(rq);
}
-#endif
static void deferred_irq_workfn(struct irq_work *irq_work)
{
@@ -1824,7 +1835,6 @@ static void schedule_deferred(struct rq *rq)
{
lockdep_assert_rq_held(rq);
-#ifdef CONFIG_SMP
/*
* If in the middle of waking up a task, task_woken_scx() will be called
* afterwards which will then run the deferred actions, no need to
@@ -1842,7 +1852,7 @@ static void schedule_deferred(struct rq *rq)
deferred_bal_cb_workfn);
return;
}
-#endif
+
/*
* No scheduler hooks available. Queue an irq work. They are executed on
* IRQ re-enable which may take a bit longer than the scheduler hooks.
@@ -2546,7 +2556,6 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
p->scx.dsq = dst_dsq;
}
-#ifdef CONFIG_SMP
/**
* move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
* @p: task to move
@@ -2713,11 +2722,6 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
return false;
}
}
-#else /* CONFIG_SMP */
-static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; }
-static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
-#endif /* CONFIG_SMP */
/**
* move_task_between_dsqs() - Move a task from one DSQ to another
@@ -2890,9 +2894,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-#ifdef CONFIG_SMP
struct rq *locked_rq = rq;
-#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2906,7 +2908,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
return;
}
-#ifdef CONFIG_SMP
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dispatch_enqueue(sch, find_global_dsq(p), p,
@@ -2966,9 +2967,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
-#else /* CONFIG_SMP */
- BUG(); /* control can not reach here on UP */
-#endif /* CONFIG_SMP */
}
/**
@@ -3292,10 +3290,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
-#ifdef CONFIG_SMP
if (class == &stop_sched_class)
return SCX_CPU_PREEMPT_STOP;
-#endif
if (class == &dl_sched_class)
return SCX_CPU_PREEMPT_DL;
if (class == &rt_sched_class)
@@ -3308,14 +3304,12 @@ static void switch_class(struct rq *rq, struct task_struct *next)
struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
-#ifdef CONFIG_SMP
/*
* Pairs with the smp_load_acquire() issued by a CPU in
* kick_cpus_irq_workfn() who is waiting for this CPU to perform a
* resched.
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
-#endif
if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
@@ -3512,8 +3506,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
}
#endif /* CONFIG_SCHED_CORE */
-#ifdef CONFIG_SMP
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
struct scx_sched *sch = scx_root;
@@ -3643,7 +3635,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
{
@@ -4098,7 +4089,9 @@ static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
{
- tg->scx_weight = CGROUP_WEIGHT_DFL;
+ tg->scx.weight = CGROUP_WEIGHT_DFL;
+ tg->scx.bw_period_us = default_bw_period_us();
+ tg->scx.bw_quota_us = RUNTIME_INF;
}
int scx_tg_online(struct task_group *tg)
@@ -4106,14 +4099,17 @@ int scx_tg_online(struct task_group *tg)
struct scx_sched *sch = scx_root;
int ret = 0;
- WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+ WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
- { .weight = tg->scx_weight };
+ { .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us };
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
NULL, tg->css.cgroup, &args);
@@ -4121,9 +4117,9 @@ int scx_tg_online(struct task_group *tg)
ret = ops_sanitize_err(sch, "cgroup_init", ret);
}
if (ret == 0)
- tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
} else {
- tg->scx_flags |= SCX_TG_ONLINE;
+ tg->scx.flags |= SCX_TG_ONLINE;
}
percpu_up_read(&scx_cgroup_rwsem);
@@ -4134,15 +4130,15 @@ void scx_tg_offline(struct task_group *tg)
{
struct scx_sched *sch = scx_root;
- WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
+ WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
- (tg->scx_flags & SCX_TG_INITED))
+ (tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
- tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+ tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4251,11 +4247,11 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
- tg->scx_weight != weight)
+ tg->scx.weight != weight)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
- tg->scx_weight = weight;
+ tg->scx.weight = weight;
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4265,6 +4261,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
/* TODO: Implement ops->cgroup_set_idle() */
}
+void scx_group_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+ (tg->scx.bw_period_us != period_us ||
+ tg->scx.bw_quota_us != quota_us ||
+ tg->scx.bw_burst_us != burst_us))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+ tg_cgrp(tg), period_us, quota_us, burst_us);
+
+ tg->scx.bw_period_us = period_us;
+ tg->scx.bw_quota_us = quota_us;
+ tg->scx.bw_burst_us = burst_us;
+
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
static void scx_cgroup_lock(void)
{
percpu_down_write(&scx_cgroup_rwsem);
@@ -4308,14 +4325,12 @@ DEFINE_SCHED_CLASS(ext) = {
.put_prev_task = put_prev_task_scx,
.set_next_task = set_next_task_scx,
-#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_scx,
.task_woken = task_woken_scx,
.set_cpus_allowed = set_cpus_allowed_scx,
.rq_online = rq_online_scx,
.rq_offline = rq_offline_scx,
-#endif
.task_tick = task_tick_scx,
@@ -4408,9 +4423,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- if (!(tg->scx_flags & SCX_TG_INITED))
+ if (!(tg->scx.flags & SCX_TG_INITED))
continue;
- tg->scx_flags &= ~SCX_TG_INITED;
+ tg->scx.flags &= ~SCX_TG_INITED;
if (!sch->ops.cgroup_exit)
continue;
@@ -4442,14 +4457,19 @@ static int scx_cgroup_init(struct scx_sched *sch)
rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+ struct scx_cgroup_init_args args = {
+ .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us,
+ };
- if ((tg->scx_flags &
+ if ((tg->scx.flags &
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
if (!sch->ops.cgroup_init) {
- tg->scx_flags |= SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_INITED;
continue;
}
@@ -4464,7 +4484,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
- tg->scx_flags |= SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_INITED;
rcu_read_lock();
css_put(css);
@@ -4657,6 +4677,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
}
/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch)) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
+ break;
+ default:
+ rcu_read_unlock();
+ return false;
+ }
+
+ scx_error(sch, "RCU CPU stall detected!");
+ rcu_read_unlock();
+
+ return true;
+}
+
+/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
*
@@ -5944,6 +5999,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5981,6 +6037,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_move = sched_ext_ops__cgroup_move,
.cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
.cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+ .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
#endif
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
@@ -6338,7 +6395,8 @@ __bpf_kfunc_start_defs();
* When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
* and this function can be called upto ops.dispatch_max_batch times to insert
* multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
- * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the
+ * counter.
*
* This function doesn't have any locking restrictions and may be called under
* BPF locks (in the future when BPF introduces more flexible locking).
@@ -6362,14 +6420,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
scx_dsq_insert_commit(p, dsq_id, enq_flags);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
- scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
-}
-
/**
* scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
* @p: task_struct to insert
@@ -6407,21 +6457,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
- scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
@@ -6594,13 +6634,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
}
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
- return scx_bpf_dsq_move_to_local(dsq_id);
-}
-
/**
* scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6619,14 +6652,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
- struct bpf_iter_scx_dsq *it__iter, u64 slice)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
- scx_bpf_dsq_move_set_slice(it__iter, slice);
-}
-
/**
* scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6646,14 +6671,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
- struct bpf_iter_scx_dsq *it__iter, u64 vtime)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
- scx_bpf_dsq_move_set_vtime(it__iter, vtime);
-}
-
/**
* scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
* @it__iter: DSQ iterator in progress
@@ -6686,15 +6703,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
- return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
-}
-
/**
* scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
* @it__iter: DSQ iterator in progress
@@ -6720,30 +6728,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()");
- return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_consume)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -6874,10 +6868,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index a75835c23f15..292bb41a242e 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -13,8 +13,24 @@ static inline bool scx_kf_allowed_if_unlocked(void)
return !current->scx.kf_mask;
}
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
+
DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -65,7 +81,7 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
-#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+#ifdef CONFIG_SCHED_CLASS_EXT
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
@@ -88,6 +104,7 @@ void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
+void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
#else /* CONFIG_EXT_GROUP_SCHED */
static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
@@ -98,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 001fb88a8481..7174e1c1a392 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -17,7 +17,6 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
/* Enable/disable per-node idle cpumasks */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
/* Enable/disable LLC aware optimizations */
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
@@ -75,7 +74,7 @@ static int scx_cpu_node_if_enabled(int cpu)
return cpu_to_node(cpu);
}
-bool scx_idle_test_and_clear_cpu(int cpu)
+static bool scx_idle_test_and_clear_cpu(int cpu)
{
int node = scx_cpu_node_if_enabled(cpu);
struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
@@ -198,7 +197,7 @@ pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u6
/*
* Find an idle CPU in the system, starting from @node.
*/
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
{
s32 cpu;
@@ -250,7 +249,7 @@ static struct cpumask *llc_span(s32 cpu)
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd)
- return 0;
+ return NULL;
return sched_domain_span(sd);
}
@@ -794,7 +793,6 @@ static void reset_idle_masks(struct sched_ext_ops *ops)
cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
}
}
-#endif /* CONFIG_SMP */
void scx_idle_enable(struct sched_ext_ops *ops)
{
@@ -808,9 +806,7 @@ void scx_idle_enable(struct sched_ext_ops *ops)
else
static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
reset_idle_masks(ops);
-#endif
}
void scx_idle_disable(void)
@@ -860,8 +856,8 @@ static bool check_builtin_idle_enabled(void)
return false;
}
-s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
- const struct cpumask *allowed, u64 flags)
+static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
@@ -896,7 +892,6 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
if (!rq)
lockdep_assert_held(&p->pi_lock);
-#ifdef CONFIG_SMP
/*
* This may also be called from ops.enqueue(), so we need to handle
* per-CPU tasks as well. For these tasks, we can skip all idle CPU
@@ -913,9 +908,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
allowed ?: p->cpus_ptr, flags);
}
-#else
- cpu = -EBUSY;
-#endif
+
if (scx_kf_allowed_if_unlocked())
task_rq_unlock(rq, p, &rf);
@@ -929,14 +922,10 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
-#ifdef CONFIG_NUMA
if (!kf_cpu_valid(cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
-#else
- return 0;
-#endif
}
/**
@@ -1010,11 +999,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1034,11 +1019,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1057,14 +1038,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(node)->smt;
else
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1085,14 +1062,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(NUMA_NO_NODE)->smt;
else
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1125,10 +1098,10 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
if (!check_builtin_idle_enabled())
return false;
- if (kf_cpu_valid(cpu, NULL))
- return scx_idle_test_and_clear_cpu(cpu);
- else
+ if (!kf_cpu_valid(cpu, NULL))
return false;
+
+ return scx_idle_test_and_clear_cpu(cpu);
}
/**
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 37be78a7502b..fa583f141f35 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,20 +12,8 @@
struct sched_ext_ops;
-#ifdef CONFIG_SMP
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
void scx_idle_init_masks(void);
-bool scx_idle_test_and_clear_cpu(int cpu);
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
-#else /* !CONFIG_SMP */
-static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
-static inline void scx_idle_init_masks(void) {}
-static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
-static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
-{
- return -EBUSY;
-}
-#endif /* CONFIG_SMP */
s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d3f33d10c58c..be9745d104f7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -403,7 +403,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
extern struct list_head task_groups;
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
extern const u64 max_bw_quota_period_us;
/*
@@ -414,7 +414,7 @@ static inline u64 default_bw_period_us(void)
{
return 100000ULL;
}
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -472,10 +472,7 @@ struct task_group {
struct rt_bandwidth rt_bandwidth;
#endif
-#ifdef CONFIG_EXT_GROUP_SCHED
- u32 scx_flags; /* SCX_TG_* */
- u32 scx_weight;
-#endif
+ struct scx_task_group scx;
struct rcu_head rcu;
struct list_head list;
diff --git a/kernel/sys.c b/kernel/sys.c
index 18a037cc6f61..1e28b40053ce 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2372,54 +2372,14 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
-#ifdef CONFIG_ANON_VMA_NAME
-
-#define ANON_VMA_NAME_MAX_LEN 80
-#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
-
-static inline bool is_valid_name_char(char ch)
-{
- /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
- return ch > 0x1f && ch < 0x7f &&
- !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
-}
-
static int prctl_set_vma(unsigned long opt, unsigned long addr,
unsigned long size, unsigned long arg)
{
- struct mm_struct *mm = current->mm;
- const char __user *uname;
- struct anon_vma_name *anon_name = NULL;
int error;
switch (opt) {
case PR_SET_VMA_ANON_NAME:
- uname = (const char __user *)arg;
- if (uname) {
- char *name, *pch;
-
- name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
- if (IS_ERR(name))
- return PTR_ERR(name);
-
- for (pch = name; *pch != '\0'; pch++) {
- if (!is_valid_name_char(*pch)) {
- kfree(name);
- return -EINVAL;
- }
- }
- /* anon_vma has its own copy */
- anon_name = anon_vma_name_alloc(name);
- kfree(name);
- if (!anon_name)
- return -ENOMEM;
-
- }
-
- mmap_write_lock(mm);
- error = madvise_set_anon_name(mm, addr, size, anon_name);
- mmap_write_unlock(mm);
- anon_vma_name_put(anon_name);
+ error = set_anon_vma_name(addr, size, (const char __user *)arg);
break;
default:
error = -EINVAL;
@@ -2428,14 +2388,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
return error;
}
-#else /* CONFIG_ANON_VMA_NAME */
-static int prctl_set_vma(unsigned long opt, unsigned long start,
- unsigned long size, unsigned long arg)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_ANON_VMA_NAME */
-
static inline unsigned long get_current_mdwe(void)
{
unsigned long ret = 0;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e400fe150f9d..0aef0e349e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -340,10 +340,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = get_random_u32_below(nr_cpu_ids);
- cpu = cpumask_next(cpu - 1, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
+ cpu = cpumask_random(cpu_online_mask);
if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
cpumask_set_cpu(cpu, &cpus_chosen);
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f135d04e1860..d2c79da81e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -200,6 +200,19 @@ menuconfig FTRACE
if FTRACE
+config TRACEFS_AUTOMOUNT_DEPRECATED
+ bool "Automount tracefs on debugfs [DEPRECATED]"
+ depends on TRACING
+ default y
+ help
+ The tracing interface was moved from /sys/kernel/debug/tracing
+ to /sys/kernel/tracing in 2015, but the tracing file system
+ was still automounted in /sys/kernel/debug for backward
+ compatibility with tooling.
+
+ The new interface has been around for more than 10 years and
+ the old debug mount will soon be removed.
+
config BOOTTIME_TRACING
bool "Boot-time Tracing support"
depends on TRACING
@@ -780,6 +793,20 @@ config UPROBE_EVENTS
This option is required if you plan to use perf-probe subcommand
of perf tools on user space applications.
+config EPROBE_EVENTS
+ bool "Enable event-based dynamic events"
+ depends on TRACING
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ select PROBE_EVENTS
+ select DYNAMIC_EVENTS
+ default y
+ help
+ Eprobes are dynamic events that can be placed on other existing
+ events. It can be used to limit what fields are recorded in
+ an event or even dereference a field of an event. It can
+ convert the type of an event field. For example, turn an
+ address into a string.
+
config BPF_EVENTS
depends on BPF_SYSCALL
depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 057cd975d014..dcb4e02afc5f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
-obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
+obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 314ffc143039..acb0c971a408 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
- struct cpumask cpu_mask;
+ cpumask_var_t cpu_mask;
+
+ if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
if (cpu_affinity > -1) {
- cpumask_clear(&cpu_mask);
- cpumask_set_cpu(cpu_affinity, &cpu_mask);
- if (set_cpus_allowed_ptr(current, &cpu_mask))
+ cpumask_clear(cpu_mask);
+ cpumask_set_cpu(cpu_affinity, cpu_mask);
+ if (set_cpus_allowed_ptr(current, cpu_mask))
pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
}
@@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data)
__set_current_state(TASK_RUNNING);
+ free_cpumask_var(cpu_mask);
+
return 0;
}
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index bd7d56dbf6c2..1482e91c39f4 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -674,8 +674,6 @@ static bool __read_mostly monitoring_on;
*/
bool rv_monitoring_on(void)
{
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_rmb();
return READ_ONCE(monitoring_on);
}
@@ -695,8 +693,6 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
static void turn_monitoring_off(void)
{
WRITE_ONCE(monitoring_on, false);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void reset_all_monitors(void)
@@ -712,8 +708,6 @@ static void reset_all_monitors(void)
static void turn_monitoring_on(void)
{
WRITE_ONCE(monitoring_on, true);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void turn_monitoring_on_with_reset(void)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7996f26c3f46..b9716178f728 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -936,7 +936,6 @@ int tracing_is_enabled(void)
* return the mirror variable of the state of the ring buffer.
* It's a little racy, but we don't really care.
*/
- smp_rmb();
return !global_trace.buffer_disabled;
}
@@ -1107,8 +1106,6 @@ void tracer_tracing_on(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 0;
- /* Make the flag seen by readers */
- smp_wmb();
}
/**
@@ -1640,8 +1637,6 @@ void tracer_tracing_off(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 1;
- /* Make the flag seen by readers */
- smp_wmb();
}
/**
@@ -2710,8 +2705,6 @@ void trace_buffered_event_enable(void)
static void enable_trace_buffered_event(void *data)
{
- /* Probably not needed, but do it anyway */
- smp_rmb();
this_cpu_dec(trace_buffered_event_cnt);
}
@@ -5931,17 +5924,27 @@ static inline void trace_insert_eval_map_file(struct module *mod,
struct trace_eval_map **start, int len) { }
#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
-static void trace_insert_eval_map(struct module *mod,
- struct trace_eval_map **start, int len)
+static void
+trace_event_update_with_eval_map(struct module *mod,
+ struct trace_eval_map **start,
+ int len)
{
struct trace_eval_map **map;
- if (len <= 0)
- return;
+ /* Always run sanitizer only if btf_type_tag attr exists. */
+ if (len <= 0) {
+ if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) &&
+ IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) &&
+ __has_attribute(btf_type_tag)))
+ return;
+ }
map = start;
- trace_event_eval_update(map, len);
+ trace_event_update_all(map, len);
+
+ if (len <= 0)
+ return;
trace_insert_eval_map_file(mod, start, len);
}
@@ -6297,7 +6300,7 @@ static bool tracer_options_updated;
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
- if (!tr->dir)
+ if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL))
return;
/* Only create trace option files after update_tracer_options finish */
@@ -8978,13 +8981,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
- if (WARN_ON(!tr->dir))
- return ERR_PTR(-ENODEV);
-
/* Top directory uses NULL as the parent */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
return NULL;
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+
/* All sub buffers have a descriptor */
return tr->dir;
}
@@ -10250,6 +10253,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
ftrace_init_tracefs(tr, d_tracer);
}
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
@@ -10271,6 +10275,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
if (IS_ERR(fc))
return ERR_CAST(fc);
+ pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
+
ret = vfs_parse_fs_string(fc, "source",
"tracefs", strlen("tracefs"));
if (!ret)
@@ -10281,6 +10287,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
put_fs_context(fc);
return mnt;
}
+#endif
/**
* tracing_init_dentry - initialize top level trace array
@@ -10305,6 +10312,7 @@ int tracing_init_dentry(void)
if (WARN_ON(!tracefs_initialized()))
return -ENODEV;
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
/*
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
@@ -10313,6 +10321,7 @@ int tracing_init_dentry(void)
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
+#endif
return 0;
}
@@ -10329,7 +10338,7 @@ static void __init eval_map_work_func(struct work_struct *work)
int len;
len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
- trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
+ trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len);
}
static int __init trace_eval_init(void)
@@ -10382,9 +10391,6 @@ bool module_exists(const char *module)
static void trace_module_add_evals(struct module *mod)
{
- if (!mod->num_trace_evals)
- return;
-
/*
* Modules with bad taint do not have events created, do
* not bother with enums either.
@@ -10392,7 +10398,8 @@ static void trace_module_add_evals(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
- trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
+ /* Even if no trace_evals, this need to sanitize field types. */
+ trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd084953a98b..1dbf1d3cf2f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2125,13 +2125,13 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
-void trace_event_eval_update(struct trace_eval_map **map, int len);
+void trace_event_update_all(struct trace_eval_map **map, int len);
/* Used from boot time tracer */
extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
-static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
+static inline void trace_event_update_all(struct trace_eval_map **map, int len) { }
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d01e5c910ce1..9f3e9537417d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
+ bool soft_mode = atomic_read(&file->sm_ref) != 0;
int ret = 0;
int disable;
@@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* is set we do not want the event to be enabled before we
* clear the bit.
*
- * When soft_disable is not set but the SOFT_MODE flag is,
+ * When soft_disable is not set but the soft_mode is,
* we do nothing. Do not disable the tracepoint, otherwise
* "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
*/
@@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_dec_return(&file->sm_ref) > 0)
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
- clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ soft_mode = false;
/* Disable use of trace_buffered_event */
trace_buffered_event_disable();
} else
- disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
+ disable = !soft_mode;
if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
@@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
WARN_ON_ONCE(ret);
}
- /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
- if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+ /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */
+ if (soft_mode)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* When soft_disable is set and enable is set, we want to
* register the tracepoint for the event, but leave the event
* as is. That means, if the event was already enabled, we do
- * nothing (but set SOFT_MODE). If the event is disabled, we
+ * nothing (but set soft_mode). If the event is disabled, we
* set SOFT_DISABLED before enabling the event tracepoint, so
* it still seems to be disabled.
*/
@@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
else {
if (atomic_inc_return(&file->sm_ref) > 1)
break;
- set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ soft_mode = true;
/* Enable use of trace_buffered_event */
trace_buffered_event_enable();
}
@@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
bool cmd = false, tgid = false;
- /* Keep the event disabled, when going to SOFT_MODE. */
+ /* Keep the event disabled, when going to soft mode. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
!(flags & EVENT_FILE_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
- flags & EVENT_FILE_FL_SOFT_MODE)
+ if (atomic_read(&file->sm_ref) != 0)
strcat(buf, "*");
strcat(buf, "\n");
@@ -3267,43 +3267,120 @@ static void add_str_to_module(struct module *module, char *str)
list_add(&modstr->next, &module_strings);
}
+#define ATTRIBUTE_STR "__attribute__("
+#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1)
+
+/* Remove all __attribute__() from @type. Return allocated string or @type. */
+static char *sanitize_field_type(const char *type)
+{
+ char *attr, *tmp, *next, *ret = (char *)type;
+ int depth;
+
+ next = (char *)type;
+ while ((attr = strstr(next, ATTRIBUTE_STR))) {
+ /* Retry if "__attribute__(" is a part of another word. */
+ if (attr != next && !isspace(attr[-1])) {
+ next = attr + ATTRIBUTE_STR_LEN;
+ continue;
+ }
+
+ if (ret == type) {
+ ret = kstrdup(type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!ret))
+ return NULL;
+ attr = ret + (attr - type);
+ }
+
+ /* the ATTRIBUTE_STR already has the first '(' */
+ depth = 1;
+ next = attr + ATTRIBUTE_STR_LEN;
+ do {
+ tmp = strpbrk(next, "()");
+ /* There is unbalanced parentheses */
+ if (WARN_ON_ONCE(!tmp)) {
+ kfree(ret);
+ return (char *)type;
+ }
+
+ if (*tmp == '(')
+ depth++;
+ else
+ depth--;
+ next = tmp + 1;
+ } while (depth > 0);
+ next = skip_spaces(next);
+ strcpy(attr, next);
+ next = attr;
+ }
+ return ret;
+}
+
+static char *find_replacable_eval(const char *type, const char *eval_string,
+ int len)
+{
+ char *ptr;
+
+ if (!eval_string)
+ return NULL;
+
+ ptr = strchr(type, '[');
+ if (!ptr)
+ return NULL;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ return NULL;
+
+ if (strncmp(eval_string, ptr, len) != 0)
+ return NULL;
+
+ return ptr;
+}
+
static void update_event_fields(struct trace_event_call *call,
struct trace_eval_map *map)
{
struct ftrace_event_field *field;
+ const char *eval_string = NULL;
struct list_head *head;
+ int len = 0;
char *ptr;
char *str;
- int len = strlen(map->eval_string);
/* Dynamic events should never have field maps */
- if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
return;
+ if (map) {
+ eval_string = map->eval_string;
+ len = strlen(map->eval_string);
+ }
+
head = trace_get_fields(call);
list_for_each_entry(field, head, link) {
- ptr = strchr(field->type, '[');
- if (!ptr)
- continue;
- ptr++;
-
- if (!isalpha(*ptr) && *ptr != '_')
- continue;
+ str = sanitize_field_type(field->type);
+ if (!str)
+ return;
- if (strncmp(map->eval_string, ptr, len) != 0)
- continue;
+ ptr = find_replacable_eval(str, eval_string, len);
+ if (ptr) {
+ if (str == field->type) {
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ }
- str = kstrdup(field->type, GFP_KERNEL);
- if (WARN_ON_ONCE(!str))
- return;
- ptr = str + (ptr - field->type);
- ptr = eval_replace(ptr, map, len);
- /* enum/sizeof string smaller than value */
- if (WARN_ON_ONCE(!ptr)) {
- kfree(str);
- continue;
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
}
+ if (str == field->type)
+ continue;
/*
* If the event is part of a module, then we need to free the string
* when the module is removed. Otherwise, it will stay allocated
@@ -3313,14 +3390,18 @@ static void update_event_fields(struct trace_event_call *call,
add_str_to_module(call->module, str);
field->type = str;
+ if (field->filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(field->type);
}
}
-void trace_event_eval_update(struct trace_eval_map **map, int len)
+/* Update all events for replacing eval and sanitizing */
+void trace_event_update_all(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
bool first = false;
+ bool updated;
int last_i;
int i;
@@ -3333,6 +3414,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
last_system = call->class->system;
}
+ updated = false;
/*
* Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
@@ -3352,8 +3434,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
update_event_printk(call, map[i]);
update_event_fields(call, map[i]);
+ updated = true;
}
}
+ /* If not updated yet, update field for sanitizing. */
+ if (!updated)
+ update_event_fields(call, NULL);
cond_resched();
}
up_write(&trace_event_sem);
@@ -3587,7 +3673,7 @@ static int probe_remove_event_call(struct trace_event_call *call)
continue;
/*
* We can't rely on ftrace_event_enable_disable(enable => 0)
- * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
+ * we are going to do, soft mode can suppress
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
@@ -3698,7 +3784,7 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
- /* Check for any strings allocade for this module */
+ /* Check for any strings allocated for this module */
list_for_each_entry_safe(modstr, m, &module_strings, next) {
if (modstr->module != mod)
continue;
@@ -4002,7 +4088,7 @@ static int free_probe_data(void *data)
edata->ref--;
if (!edata->ref) {
- /* Remove the SOFT_MODE flag */
+ /* Remove soft mode */
__ftrace_event_enable_disable(edata->file, 0, 1);
trace_event_put_ref(edata->file->event_call);
kfree(edata);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e4581e10782b..54226b48b2d1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1344,13 +1344,14 @@ struct filter_list {
struct filter_head {
struct list_head list;
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ struct rcu_work rwork;
+ };
};
-
-static void free_filter_list(struct rcu_head *rhp)
+static void free_filter_list(struct filter_head *filter_list)
{
- struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
struct filter_list *filter_item, *tmp;
list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
@@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp)
kfree(filter_list);
}
+static void free_filter_list_work(struct work_struct *work)
+{
+ struct filter_head *filter_list;
+
+ filter_list = container_of(to_rcu_work(work), struct filter_head, rwork);
+ free_filter_list(filter_list);
+}
+
static void free_filter_list_tasks(struct rcu_head *rhp)
{
- call_rcu(rhp, free_filter_list);
+ struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
+
+ INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
+ queue_rcu_work(system_wq, &filter_list->rwork);
}
/*
@@ -1460,7 +1472,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
tracepoint_synchronize_unregister();
if (head)
- free_filter_list(&head->rcu);
+ free_filter_list(head);
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir || !file->filter)
@@ -2305,7 +2317,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
return 0;
fail:
/* No call succeeded */
- free_filter_list(&filter_list->rcu);
+ free_filter_list(filter_list);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
@@ -2315,7 +2327,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
if (!fail)
delay_free_filter(filter_list);
else
- free_filter_list(&filter_list->rcu);
+ free_filter_list(filter_list);
return -ENOMEM;
}
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b65353ec2837..2f7b94e98317 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -325,12 +325,9 @@ static void move_to_next_cpu(void)
cpus_read_lock();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask);
cpus_read_unlock();
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(current_mask);
-
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto change_mode;
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
new file mode 100644
index 000000000000..eae37bea54fd
--- /dev/null
+++ b/kernel/unwind/Makefile
@@ -0,0 +1 @@
+ obj-$(CONFIG_UNWIND_USER) += user.o deferred.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..dc6040aae3ee
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ */
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+/*
+ * For requesting a deferred user space stack trace from NMI context
+ * the architecture must support a safe cmpxchg in NMI context.
+ * For those architectures that do not have that, then it cannot ask
+ * for a deferred user space stack trace from an NMI context. If it
+ * does, then it will get -EINVAL.
+ */
+#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
+# define CAN_USE_IN_NMI 1
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ u32 old = 0;
+
+ return try_cmpxchg(&info->id.cnt, &old, cnt);
+}
+#else
+# define CAN_USE_IN_NMI 0
+/* When NMIs are not allowed, this always succeeds */
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ info->id.cnt = cnt;
+ return true;
+}
+#endif
+
+/* Make the cache fit in a 4K page */
+#define UNWIND_MAX_ENTRIES \
+ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
+
+/* Guards adding to or removing from the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED)
+
+/* Zero'd bits are available for assigning callback users */
+static unsigned long unwind_mask = RESERVED_BITS;
+DEFINE_STATIC_SRCU(unwind_srcu);
+
+static inline bool unwind_pending(struct unwind_task_info *info)
+{
+ return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+}
+
+/*
+ * This is a unique percpu identifier for a given task entry context.
+ * Conceptually, it's incremented every time the CPU enters the kernel from
+ * user space, so that each "entry context" on the CPU gets a unique ID. In
+ * reality, as an optimization, it's only incremented on demand for the first
+ * deferred unwind request after a given entry-from-user.
+ *
+ * It's combined with the CPU id to make a systemwide-unique "context cookie".
+ */
+static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
+
+/*
+ * The context cookie is a unique identifier that is assigned to a user
+ * space stacktrace. As the user space stacktrace remains the same while
+ * the task is in the kernel, the cookie is an identifier for the stacktrace.
+ * Although it is possible for the stacktrace to get another cookie if another
+ * request is made after the cookie was cleared and before reentering user
+ * space.
+ */
+static u64 get_cookie(struct unwind_task_info *info)
+{
+ u32 cnt = 1;
+
+ if (info->id.cpu)
+ return info->id.id;
+
+ /* LSB is always set to ensure 0 is an invalid value */
+ cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
+ if (try_assign_cnt(info, cnt)) {
+ /* Update the per cpu counter */
+ __this_cpu_write(unwind_ctx_ctr, cnt);
+ }
+ /* Interrupts are disabled, the CPU will always be same */
+ info->id.cpu = smp_processor_id() + 1; /* Must be non zero */
+
+ return info->id.id;
+}
+
+/**
+ * unwind_user_faultable - Produce a user stacktrace in faultable context
+ * @trace: The descriptor that will store the user stacktrace
+ *
+ * This must be called in a known faultable context (usually when entering
+ * or exiting user space). Depending on the available implementations
+ * the @trace will be loaded with the addresses of the user space stacktrace
+ * if it can be found.
+ *
+ * Return: 0 on success and negative on error
+ * On success @trace will contain the user space stacktrace
+ */
+int unwind_user_faultable(struct unwind_stacktrace *trace)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ struct unwind_cache *cache;
+
+ /* Should always be called from faultable context */
+ might_fault();
+
+ if (!current->mm)
+ return -EINVAL;
+
+ if (!info->cache) {
+ info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
+ GFP_KERNEL);
+ if (!info->cache)
+ return -ENOMEM;
+ }
+
+ cache = info->cache;
+ trace->entries = cache->entries;
+
+ if (cache->nr_entries) {
+ /*
+ * The user stack has already been previously unwound in this
+ * entry context. Skip the unwind and use the cache.
+ */
+ trace->nr = cache->nr_entries;
+ return 0;
+ }
+
+ trace->nr = 0;
+ unwind_user(trace, UNWIND_MAX_ENTRIES);
+
+ cache->nr_entries = trace->nr;
+
+ /* Clear nr_entries on way back to user space */
+ set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+
+ return 0;
+}
+
+static void process_unwind_deferred(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+ struct unwind_stacktrace trace;
+ struct unwind_work *work;
+ unsigned long bits;
+ u64 cookie;
+
+ if (WARN_ON_ONCE(!unwind_pending(info)))
+ return;
+
+ /* Clear pending bit but make sure to have the current bits */
+ bits = atomic_long_fetch_andnot(UNWIND_PENDING,
+ (atomic_long_t *)&info->unwind_mask);
+ /*
+ * From here on out, the callback must always be called, even if it's
+ * just an empty trace.
+ */
+ trace.nr = 0;
+ trace.entries = NULL;
+
+ unwind_user_faultable(&trace);
+
+ if (info->cache)
+ bits &= ~(info->cache->unwind_completed);
+
+ cookie = info->id.id;
+
+ guard(srcu)(&unwind_srcu);
+ list_for_each_entry_srcu(work, &callbacks, list,
+ srcu_read_lock_held(&unwind_srcu)) {
+ if (test_bit(work->bit, &bits)) {
+ work->func(work, &trace, cookie);
+ if (info->cache)
+ info->cache->unwind_completed |= BIT(work->bit);
+ }
+ }
+}
+
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+ process_unwind_deferred(current);
+}
+
+void unwind_deferred_task_exit(struct task_struct *task)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+
+ if (!unwind_pending(info))
+ return;
+
+ process_unwind_deferred(task);
+
+ task_work_cancel(task, &info->work);
+}
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task kernel exit
+ * @work: Unwind descriptor requesting the trace
+ * @cookie: The cookie of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @cookie output is the generated cookie of the very first
+ * request for a user space stacktrace for this task since it entered the
+ * kernel. It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function. It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context. Each call will return the same cookie
+ * while the task hasn't left the kernel. If the callback is not pending
+ * because it has already been previously called for the same entry context,
+ * it will be called again with the same stack trace and cookie.
+ *
+ * Return: 0 if the callback successfully was queued.
+ * 1 if the callback is pending or was already executed.
+ * Negative if there's an error.
+ * @cookie holds the cookie of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ unsigned long old, bits;
+ unsigned long bit;
+ int ret;
+
+ *cookie = 0;
+
+ if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+ !user_mode(task_pt_regs(current)))
+ return -EINVAL;
+
+ /*
+ * NMI requires having safe cmpxchg operations.
+ * Trigger a warning to make it obvious that an architecture
+ * is using this in NMI when it should not be.
+ */
+ if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
+ return -EINVAL;
+
+ /* Do not allow cancelled works to request again */
+ bit = READ_ONCE(work->bit);
+ if (WARN_ON_ONCE(bit < 0))
+ return -EINVAL;
+
+ /* Only need the mask now */
+ bit = BIT(bit);
+
+ guard(irqsave)();
+
+ *cookie = get_cookie(info);
+
+ old = READ_ONCE(info->unwind_mask);
+
+ /* Is this already queued or executed */
+ if (old & bit)
+ return 1;
+
+ /*
+ * This work's bit hasn't been set yet. Now set it with the PENDING
+ * bit and fetch the current value of unwind_mask. If ether the
+ * work's bit or PENDING was already set, then this is already queued
+ * to have a callback.
+ */
+ bits = UNWIND_PENDING | bit;
+ old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+ if (old & bits) {
+ /*
+ * If the work's bit was set, whatever set it had better
+ * have also set pending and queued a callback.
+ */
+ WARN_ON_ONCE(!(old & UNWIND_PENDING));
+ return old & bit;
+ }
+
+ /* The work has been claimed, now schedule it. */
+ ret = task_work_add(current, &info->work, TWA_RESUME);
+
+ if (WARN_ON_ONCE(ret))
+ WRITE_ONCE(info->unwind_mask, 0);
+
+ return ret;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+ struct task_struct *g, *t;
+ int bit;
+
+ if (!work)
+ return;
+
+ bit = work->bit;
+
+ /* No work should be using a reserved bit */
+ if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
+ return;
+
+ guard(mutex)(&callback_mutex);
+ list_del_rcu(&work->list);
+
+ /* Do not allow any more requests and prevent callbacks */
+ work->bit = -1;
+
+ __clear_bit(bit, &unwind_mask);
+
+ synchronize_srcu(&unwind_srcu);
+
+ guard(rcu)();
+ /* Clear this bit from all threads */
+ for_each_process_thread(g, t) {
+ clear_bit(bit, &t->unwind_info.unwind_mask);
+ if (t->unwind_info.cache)
+ clear_bit(bit, &t->unwind_info.cache->unwind_completed);
+ }
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+ memset(work, 0, sizeof(*work));
+
+ guard(mutex)(&callback_mutex);
+
+ /* See if there's a bit in the mask available */
+ if (unwind_mask == ~0UL)
+ return -EBUSY;
+
+ work->bit = ffz(unwind_mask);
+ __set_bit(work->bit, &unwind_mask);
+
+ list_add_rcu(&work->list, &callbacks);
+ work->func = func;
+ return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ memset(info, 0, sizeof(*info));
+ init_task_work(&info->work, unwind_deferred_task_work);
+ info->unwind_mask = 0;
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ kfree(info->cache);
+ task_work_cancel(task, &info->work);
+}
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
new file mode 100644
index 000000000000..97a8415e3216
--- /dev/null
+++ b/kernel/unwind/user.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+* Generic interfaces for unwinding user space
+*/
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_user.h>
+#include <linux/uaccess.h>
+
+static const struct unwind_user_frame fp_frame = {
+ ARCH_INIT_USER_FP_FRAME
+};
+
+#define for_each_user_frame(state) \
+ for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
+
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+ const struct unwind_user_frame *frame = &fp_frame;
+ unsigned long cfa, fp, ra;
+ unsigned int shift;
+
+ if (frame->use_fp) {
+ if (state->fp < state->sp)
+ return -EINVAL;
+ cfa = state->fp;
+ } else {
+ cfa = state->sp;
+ }
+
+ /* Get the Canonical Frame Address (CFA) */
+ cfa += frame->cfa_off;
+
+ /* stack going in wrong direction? */
+ if (cfa <= state->sp)
+ return -EINVAL;
+
+ /* Make sure that the address is word aligned */
+ shift = sizeof(long) == 4 ? 2 : 3;
+ if (cfa & ((1 << shift) - 1))
+ return -EINVAL;
+
+ /* Find the Return Address (RA) */
+ if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+ return -EINVAL;
+
+ if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ return -EINVAL;
+
+ state->ip = ra;
+ state->sp = cfa;
+ if (frame->fp_off)
+ state->fp = fp;
+ return 0;
+}
+
+static int unwind_user_next(struct unwind_user_state *state)
+{
+ unsigned long iter_mask = state->available_types;
+ unsigned int bit;
+
+ if (state->done)
+ return -EINVAL;
+
+ for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) {
+ enum unwind_user_type type = BIT(bit);
+
+ state->current_type = type;
+ switch (type) {
+ case UNWIND_USER_TYPE_FP:
+ if (!unwind_user_next_fp(state))
+ return 0;
+ continue;
+ default:
+ WARN_ONCE(1, "Undefined unwind bit %d", bit);
+ break;
+ }
+ break;
+ }
+
+ /* No successful unwind method. */
+ state->current_type = UNWIND_USER_TYPE_NONE;
+ state->done = true;
+ return -EINVAL;
+}
+
+static int unwind_user_start(struct unwind_user_state *state)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+
+ memset(state, 0, sizeof(*state));
+
+ if ((current->flags & PF_KTHREAD) || !user_mode(regs)) {
+ state->done = true;
+ return -EINVAL;
+ }
+
+ if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+ state->available_types |= UNWIND_USER_TYPE_FP;
+
+ state->ip = instruction_pointer(regs);
+ state->sp = user_stack_pointer(regs);
+ state->fp = frame_pointer(regs);
+
+ return 0;
+}
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
+{
+ struct unwind_user_state state;
+
+ trace->nr = 0;
+
+ if (!max_entries)
+ return -EINVAL;
+
+ if (current->flags & PF_KTHREAD)
+ return 0;
+
+ for_each_user_frame(&state) {
+ trace->entries[trace->nr++] = state.ip;
+ if (trace->nr >= max_entries)
+ break;
+ }
+
+ return 0;
+}
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2f844c279a3e..bc738fa90c1d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
if (IS_ERR(tsk)) {
kfree(vtsk);
- return ERR_PTR(PTR_ERR(tsk));
+ return ERR_CAST(tsk);
}
vtsk->task = tsk;
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
index 34dbfe091f4b..ee754d767c21 100644
--- a/kernel/watchdog_buddy.c
+++ b/kernel/watchdog_buddy.c
@@ -12,10 +12,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu)
{
unsigned int next_cpu;
- next_cpu = cpumask_next(cpu, &watchdog_cpus);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(&watchdog_cpus);
-
+ next_cpu = cpumask_next_wrap(cpu, &watchdog_cpus);
if (next_cpu == cpu)
return nr_cpu_ids;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9f9148075828..c6b79b3675c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -505,12 +505,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init;
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_percpu_wq __ro_after_init;
+EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_dfl_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
@@ -1686,17 +1690,14 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
int max = READ_ONCE(nna->max);
+ int old = atomic_read(&nna->nr);
- while (true) {
- int old, tmp;
-
- old = atomic_read(&nna->nr);
+ do {
if (old >= max)
return false;
- tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
- if (tmp == old)
- return true;
- }
+ } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1));
+
+ return true;
}
/**
@@ -2221,12 +2222,9 @@ static int wq_select_unbound_cpu(int cpu)
}
new_cpu = __this_cpu_read(wq_rr_cpu_last);
- new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids)) {
- new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids))
- return cpu;
- }
+ new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids))
+ return cpu;
__this_cpu_write(wq_rr_cpu_last, new_cpu);
return new_cpu;
@@ -4629,7 +4627,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
{
struct workqueue_attrs *attrs;
@@ -5682,12 +5680,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
else
wq_size = sizeof(*wq);
- wq = kzalloc(wq_size, GFP_KERNEL);
+ wq = kzalloc_noprof(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
if (flags & WQ_UNBOUND) {
- wq->unbound_attrs = alloc_workqueue_attrs();
+ wq->unbound_attrs = alloc_workqueue_attrs_noprof();
if (!wq->unbound_attrs)
goto err_free_wq;
}
@@ -5777,9 +5775,9 @@ err_destroy:
}
__printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
- unsigned int flags,
- int max_active, ...)
+struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
{
struct workqueue_struct *wq;
va_list args;
@@ -5794,7 +5792,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
}
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
#ifdef CONFIG_LOCKDEP
__printf(1, 5)
@@ -6770,31 +6768,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *),
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);
-
-/**
- * work_on_cpu_safe_key - run a function in thread context on a particular cpu
- * @cpu: the cpu to run on
- * @fn: the function to run
- * @arg: the function argument
- * @key: The lock class key for lock debugging purposes
- *
- * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
- * any locks which would prevent @fn from completing.
- *
- * Return: The value @fn returns.
- */
-long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
- void *arg, struct lock_class_key *key)
-{
- long ret = -ENODEV;
-
- cpus_read_lock();
- if (cpu_online(cpu))
- ret = work_on_cpu_key(cpu, fn, arg, key);
- cpus_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
@@ -7830,10 +7803,11 @@ void __init workqueue_init_early(void)
}
system_wq = alloc_workqueue("events", 0, 0);
+ system_percpu_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
- system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_MAX_ACTIVE);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+ system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -7844,8 +7818,8 @@ void __init workqueue_init_early(void)
system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
WQ_BH | WQ_HIGHPRI, 0);
- BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq ||
+ BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
!system_bh_wq || !system_bh_highpri_wq);