summaryrefslogtreecommitdiff
path: root/kernel/cgroup
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/cgroup-v1.c14
-rw-r--r--kernel/cgroup/cgroup.c5
-rw-r--r--kernel/cgroup/cpuset.c2
-rw-r--r--kernel/cgroup/legacy_freezer.c8
-rw-r--r--kernel/cgroup/rstat.c195
5 files changed, 110 insertions, 114 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index fa24c032ed6f..2a4a387f867a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*/
for_each_subsys(ss, i) {
- if (cgroup1_subsys_absent(ss))
- continue;
cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+ if (!proc_show_all && cgroup1_subsys_absent(ss))
+ continue;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
@@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str)
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+ return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a723b7dc6e4e..312c6a8b55bb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2074,6 +2074,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+#ifdef CONFIG_CGROUP_BPF
+ for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
+ cgrp->bpf.revisions[i] = 1;
+#endif
+
init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3bc4301466f3..f74d04429a29 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4051,7 +4051,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 507b8f19a262..dd9417425d92 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer)
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
- unsigned int state;
rcu_read_lock();
- /* Check if the cgroup is still FREEZING, but not FROZEN. The extra
- * !FROZEN check is required, because the FREEZING bit is not cleared
- * when the state FROZEN is reached.
- */
- state = task_freezer(task)->state;
- ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index cbeaa499a96a..981e2f77ad4e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -10,7 +10,7 @@
#include <trace/events/cgroup.h>
static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -45,84 +45,11 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
return &rstat_base_lock;
}
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
{
- if (ss) {
- /*
- * Depending on config, the subsystem per-cpu lock type may be an
- * empty struct. In enviromnents where this is the case, allocation
- * of this field is not performed in ss_rstat_init(). Avoid a
- * cpu-based offset relative to NULL by returning early. When the
- * lock type is zero in size, the corresponding lock functions are
- * no-ops so passing them NULL is acceptable.
- */
- if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
- return NULL;
-
- return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
- }
-
- return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
- const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
- unsigned long flags;
- bool contended;
-
- /*
- * The _irqsave() is needed because the locks used for flushing are
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
- * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
- * kernel. The raw_spinlock_t below disables interrupts on both
- * configurations. The _irqsave() ensures that interrupts are always
- * disabled and later restored.
- */
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
- if (contended) {
- if (fast_path)
- trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
- raw_spin_lock_irqsave(cpu_lock, flags);
- }
-
- if (fast_path)
- trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
- return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
- unsigned long flags, const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
-
- if (fast_path)
- trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
- else
- trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
}
/**
@@ -130,13 +57,22 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
* @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
*/
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
- unsigned long flags;
+ struct llist_head *lhead;
+ struct css_rstat_cpu *rstatc;
+ struct css_rstat_cpu __percpu *rstatc_pcpu;
+ struct llist_node *self;
/*
* Since bpf programs can call this function, prevent access to
@@ -145,19 +81,49 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
if (!css_uses_rstat(css))
return;
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
+ * the requests from nmi context.
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ return;
+
+ rstatc = css_rstat_cpu(css, cpu);
/*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
+ * If already on list return. This check is racy and smp_mb() is needed
+ * to pair it with the smp_mb() in css_process_update_tree() if the
+ * guarantee that the updated stats are visible to concurrent flusher is
+ * needed.
+ */
+ if (llist_on_list(&rstatc->lnode))
+ return;
+
+ /*
+ * This function can be renentered by irqs and nmis for the same cgroup
+ * and may try to insert the same per-cpu lnode into the llist. Note
+ * that llist_add() does not protect against such scenarios.
*
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @css is on the list by
- * testing the next pointer for NULL.
+ * To protect against such stacked contexts of irqs/nmis, we use the
+ * fact that lnode points to itself when not on a list and then use
+ * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+ * which will call llist_add(). The losers can assume the insertion is
+ * successful and the winner will eventually add the per-cpu lnode to
+ * the llist.
*/
- if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+ self = &rstatc->lnode;
+ rstatc_pcpu = css->rstat_cpu;
+ if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
return;
- flags = _css_rstat_cpu_lock(css, cpu, true);
+ lhead = ss_lhead_cpu(css->ss, cpu);
+ llist_add(&rstatc->lnode, lhead);
+}
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
/* put @css and all ancestors on the corresponding updated lists */
while (true) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -183,8 +149,34 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
css = parent;
}
+}
+
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ /*
+ * smp_mb() is needed here (more specifically in between
+ * init_llist_node() and per-cpu stats flushing) if the
+ * guarantee is required by a rstat user where etiher the
+ * updater should add itself on the lockless list or the
+ * flusher flush the stats updated by the updater who have
+ * observed that they are already on the list. The
+ * corresponding barrier pair for this one should be before
+ * css_rstat_updated() by the user.
+ *
+ * For now, there aren't any such user, so not adding the
+ * barrier here but if such a use-case arise, please add
+ * smp_mb() here.
+ */
- _css_rstat_cpu_unlock(css, cpu, flags, true);
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_process_update_tree(rstatc->owner, cpu);
+ }
}
/**
@@ -288,13 +280,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
{
struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
struct cgroup_subsys_state *head = NULL, *parent, *child;
- unsigned long flags;
- flags = _css_rstat_cpu_lock(root, cpu, false);
+ css_process_update_tree(root->ss, cpu);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
- goto unlock_ret;
+ return NULL;
/*
* Unlink @root from its parent. As the updated_children list is
@@ -326,8 +317,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
rstatc->updated_children = root;
if (child != root)
head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
- _css_rstat_cpu_unlock(root, cpu, flags, false);
+
return head;
}
@@ -468,7 +458,8 @@ int css_rstat_init(struct cgroup_subsys_state *css)
for_each_possible_cpu(cpu) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- rstatc->updated_children = css;
+ rstatc->owner = rstatc->updated_children = css;
+ init_llist_node(&rstatc->lnode);
if (is_self) {
struct cgroup_rstat_base_cpu *rstatbc;
@@ -522,19 +513,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
{
int cpu;
- /*
- * Depending on config, the subsystem per-cpu lock type may be an empty
- * struct. Avoid allocating a size of zero in this case.
- */
- if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
- ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
- if (!ss->rstat_ss_cpu_lock)
+ if (ss) {
+ ss->lhead = alloc_percpu(struct llist_head);
+ if (!ss->lhead)
return -ENOMEM;
}
spin_lock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
- raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+ init_llist_head(ss_lhead_cpu(ss, cpu));
return 0;
}