diff options
Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r-- | kernel/cgroup/cgroup.c | 1799 |
1 files changed, 1185 insertions, 614 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd247747ec14..afc665b7b1fe 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -30,6 +30,7 @@ #include "cgroup-internal.h" +#include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> @@ -56,6 +57,7 @@ #include <linux/file.h> #include <linux/fs_parser.h> #include <linux/sched/cputime.h> +#include <linux/sched/deadline.h> #include <linux/psi.h> #include <net/sock.h> @@ -68,6 +70,14 @@ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. + */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -87,7 +97,7 @@ EXPORT_SYMBOL_GPL(css_set_lock); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; -bool cgroup_debug __read_mostly; +static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without @@ -197,9 +207,11 @@ static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; +static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = REFCOUNT_INIT(2), + .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, @@ -208,6 +220,23 @@ struct cgroup_namespace init_cgroup_ns = { static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; +static struct cftype cgroup_psi_files[]; + +/* cgroup optional features */ +enum cgroup_opt_features { +#ifdef CONFIG_PSI + OPT_FEATURE_PRESSURE, +#endif + OPT_FEATURE_COUNT +}; + +static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { +#ifdef CONFIG_PSI + "pressure", +#endif +}; + +static u16 cgroup_feature_disable_mask __read_mostly; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); @@ -222,6 +251,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +#ifdef CONFIG_DEBUG_CGROUP_REF +#define CGROUP_REF_FN_ATTRS noinline +#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); +#include <linux/cgroup_refcnt.h> +#endif + /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest @@ -232,7 +267,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ bool cgroup_ssid_enabled(int ssid) { - if (CGROUP_SUBSYS_COUNT == 0) + if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); @@ -244,7 +279,7 @@ bool cgroup_ssid_enabled(int ssid) * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for - * cases where a subsystem should behave differnetly depending on the + * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: @@ -254,15 +289,13 @@ bool cgroup_ssid_enabled(int ssid) * * - When mounting an existing superblock, mount options should match. * - * - Remount is disallowed. - * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got - * recycled inbetween reads. + * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. @@ -281,12 +314,7 @@ bool cgroup_ssid_enabled(int ssid) * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { @@ -329,7 +357,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } -bool cgroup_is_threaded(struct cgroup *cgrp) +static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } @@ -345,7 +373,7 @@ static bool cgroup_is_mixable(struct cgroup *cgrp) return !cgroup_parent(cgrp); } -/* can @cgrp become a thread root? should always be true for a thread root */ +/* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ @@ -368,7 +396,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) } /* is @cgrp root of a threaded subtree? */ -bool cgroup_is_thread_root(struct cgroup *cgrp) +static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) @@ -459,7 +487,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - if (ss) + if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else @@ -467,28 +495,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, } /** - * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist - * or is offline, %NULL is returned. - */ -static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - css = cgroup_css(cgrp, ss); - if (css && !css_tryget_online(css)) - css = NULL; - rcu_read_unlock(); - - return css; -} - -/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -530,13 +536,16 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the - * callers responsiblity to tryget a reference for it. + * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + do { css = cgroup_css(cgrp, ss); @@ -564,6 +573,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + rcu_read_lock(); do { @@ -580,11 +592,12 @@ out_unlock: rcu_read_unlock(); return css; } +EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); + cgroup_get(cgrp); } /** @@ -633,7 +646,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ - if (cft->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; @@ -646,7 +659,7 @@ EXPORT_SYMBOL_GPL(of_css); * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_[tree_]mutex. + * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -656,21 +669,6 @@ EXPORT_SYMBOL_GPL(of_css); else /** - * for_each_e_css - iterate all effective css's of a cgroup - * @css: the iteration cursor - * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end - * @cgrp: the target cgroup to iterate css's of - * - * Should be called under cgroup_[tree_]mutex. - */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css_by_mask(cgrp, \ - cgroup_subsys[(ssid)]))) \ - ; \ - else - -/** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end @@ -681,7 +679,7 @@ EXPORT_SYMBOL_GPL(of_css); */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ - if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ + if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ @@ -702,7 +700,7 @@ EXPORT_SYMBOL_GPL(of_css); ; \ else -/* walk live descendants in preorder */ +/* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ @@ -736,7 +734,8 @@ struct css_set init_css_set = { .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), + .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* @@ -910,7 +909,7 @@ static void css_set_move_task(struct task_struct *task, #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; @@ -936,7 +935,7 @@ void put_css_set_locked(struct css_set *cset) WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); - /* This css_set is dead. unlink it and release cgroup and css refs */ + /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); @@ -1051,7 +1050,7 @@ static bool compare_css_sets(struct css_set *cset, */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) + struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; @@ -1061,7 +1060,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* * Build the set of subsystem state objects that we want to see in the - * new css_set. while subsystems can change globally, the entries here + * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { @@ -1151,7 +1150,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, /* * Always add links to the tail of the lists so that the lists are - * in choronological order. + * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); @@ -1211,7 +1210,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_src_preload_node); + INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in @@ -1273,11 +1273,25 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *root_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } +void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) +{ + bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; + + /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + if (favor && !favoring) { + rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); + root->flags |= CGRP_ROOT_FAVOR_DYNMODS; + } else if (!favor && favoring) { + rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); + root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + } +} + static int cgroup_init_root_id(struct cgroup_root *root) { int id; @@ -1301,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1333,89 +1347,130 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } + WARN_ON_ONCE(list_empty(&root->root_list)); + list_del_rcu(&root->root_list); + cgroup_root_count--; + + if (!have_favordynmods) + cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); + cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* - * look up cgroup associated with current task's cgroup namespace on the - * specified hierarchy + * Returned cgroup is without refcount but it's valid as long as cset pins it. */ -static struct cgroup * -current_cgns_cgroup_from_root(struct cgroup_root *root) +static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) { - struct cgroup *res = NULL; - struct css_set *cset; + struct cgroup *res_cgroup = NULL; - lockdep_assert_held(&css_set_lock); - - rcu_read_lock(); - - cset = current->nsproxy->cgroup_ns->root_cset; if (cset == &init_css_set) { - res = &root->cgrp; + res_cgroup = &root->cgrp; } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; + res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; + lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { - res = c; + res_cgroup = c; break; } } } - rcu_read_unlock(); - BUG_ON(!res); - return res; + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. Consequently, + * when accessing a cgroup root, the cset_link may have already been + * freed, resulting in a NULL res_cgroup. However, by holding the + * cgroup_mutex, we ensure that res_cgroup can't be NULL. + * If we don't hold cgroup_mutex in the caller, we must do the NULL + * check. + */ + return res_cgroup; } -/* look up cgroup associated with given css_set on the specified hierarchy */ -static struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroup_root *root) +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; + struct css_set *cset; - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - if (cset == &init_css_set) { - res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; - } else { - struct cgrp_cset_link *link; + rcu_read_lock(); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; + cset = current->nsproxy->cgroup_ns->root_cset; + res = __cset_cgroup_from_root(cset, root); - if (c->root == root) { - res = c; - break; - } - } - } + rcu_read_unlock(); - BUG_ON(!res); + /* + * The namespace_sem is held by current, so the root cgroup can't + * be umounted. Therefore, we can ensure that the res is non-NULL. + */ + WARN_ON_ONCE(!res); return res; } /* + * Look up cgroup associated with current task's cgroup namespace on the default + * hierarchy. + * + * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: + * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu + * pointers. + * - css_set_lock is not needed because we just read cset->dfl_cgrp. + * - As a bonus returned cgrp is pinned with the current because it cannot + * switch cgroup_ns asynchronously. + */ +static struct cgroup *current_cgns_cgroup_dfl(void) +{ + struct css_set *cset; + + if (current->nsproxy) { + cset = current->nsproxy->cgroup_ns->root_cset; + return __cset_cgroup_from_root(cset, &cgrp_dfl_root); + } else { + /* + * NOTE: This function may be called from bpf_cgroup_from_id() + * on a task which has already passed exit_task_namespaces() and + * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all + * cgroups visible for lookups. + */ + return &cgrp_dfl_root.cgrp; + } +} + +/* look up cgroup associated with given css_set on the specified hierarchy */ +static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) +{ + lockdep_assert_held(&css_set_lock); + + return __cset_cgroup_from_root(cset, root); +} + +/* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. + * called with css_set_lock held to prevent task's groups from being modified. + * Must be called with either cgroup_mutex or rcu read lock to prevent the + * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -1559,7 +1614,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn) else cgrp = kn->parent->priv; - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); @@ -1604,7 +1659,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else - mutex_lock(&cgroup_mutex); + cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -1635,7 +1690,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory - * @css: taget css + * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { @@ -1648,12 +1703,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - - cgroup_addrm_files(css, cgrp, cfts, false); + if (cgroup_on_dfl(cgrp)) { + cgroup_addrm_files(css, cgrp, + cgroup_base_files, false); + if (cgroup_psi_enabled()) + cgroup_addrm_files(css, cgrp, + cgroup_psi_files, false); + } else { + cgroup_addrm_files(css, cgrp, + cgroup1_base_files, false); + } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); @@ -1672,18 +1731,31 @@ static int css_populate_dir(struct cgroup_subsys_state *css) struct cftype *cfts, *failed_cfts; int ret; - if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + if (css->flags & CSS_VISIBLE) return 0; if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - - ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - if (ret < 0) - return ret; + if (cgroup_on_dfl(cgrp)) { + ret = cgroup_addrm_files(css, cgrp, + cgroup_base_files, true); + if (ret < 0) + return ret; + + if (cgroup_psi_enabled()) { + ret = cgroup_addrm_files(css, cgrp, + cgroup_psi_files, true); + if (ret < 0) { + cgroup_addrm_files(css, cgrp, + cgroup_base_files, false); + return ret; + } + } + } else { + ret = cgroup_addrm_files(css, cgrp, + cgroup1_base_files, true); + if (ret < 0) + return ret; + } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); @@ -1710,7 +1782,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1727,33 +1800,76 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask(); + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. + */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + css->cgroup = dcgrp; + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); + if (ss->css_rstat_flush) { + list_del_rcu(&css->rstat_css_node); + synchronize_rcu(); + list_add_rcu(&css->rstat_css_node, + &dcgrp->rstat_css_list); + } + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -1793,7 +1909,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); - if (len >= PATH_MAX) + if (len == -E2BIG) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); @@ -1805,15 +1921,21 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, + Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, + Opt_memory_hugetlb_accounting, + Opt_pids_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), + fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), + fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; @@ -1831,16 +1953,32 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_favordynmods: + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; + case Opt_memory_hugetlb_accounting: + ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + return 0; + case Opt_pids_localevents: + ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + return 0; } return -EINVAL; } +struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + return &ctx->peak; +} + static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { @@ -1849,6 +1987,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags) else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + cgroup_favor_dynmods(&cgrp_dfl_root, + root_flags & CGRP_ROOT_FAVOR_DYNMODS); + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else @@ -1858,6 +1999,16 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; + + if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + + if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } @@ -1865,10 +2016,16 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) + seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + seq_puts(seq, ",memory_hugetlb_accounting"); + if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + seq_puts(seq, ",pids_localevents"); return 0; } @@ -1910,12 +2067,13 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; - INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - root->flags = ctx->flags; + /* DYNMODS must be modified through cgroup_favor_dynmods() */ + root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) @@ -1966,20 +2124,26 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = PTR_ERR(root->kf_root); goto exit_root_id; } - root_cgrp->kn = root->kf_root->kn; + root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); - root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); + root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; - ret = rebind_subsystems(root, ss_mask); + ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; - ret = cgroup_bpf_inherit(root_cgrp); - WARN_ON_ONCE(ret); + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto exit_stats; + + if (root == &cgrp_dfl_root) { + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + } trace_cgroup_setup_root(root); @@ -1988,7 +2152,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ - list_add(&root->root_list, &cgroup_roots); + list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* @@ -2006,10 +2170,11 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); - kernfs_activate(root_cgrp->kn); ret = 0; goto out; +exit_stats: + cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -2043,13 +2208,13 @@ int cgroup_do_get_tree(struct fs_context *fc) struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; - mutex_lock(&cgroup_mutex); + cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); @@ -2086,7 +2251,7 @@ static int cgroup_get_tree(struct fs_context *fc) struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; - cgrp_dfl_visible = true; + WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; @@ -2132,6 +2297,10 @@ static int cgroup_init_fs_context(struct fs_context *fc) put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; + + if (have_favordynmods) + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; } @@ -2143,7 +2312,6 @@ static void cgroup_kill_sb(struct super_block *sb) /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). - * cgroup_mount() may wait for @root's release. * * And don't kill the default root. */ @@ -2170,7 +2338,7 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, @@ -2227,56 +2395,58 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, { int ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** - * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy - * @task: target task - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Determine @task's cgroup on the first (the one with the lowest non-zero - * hierarchy_id) cgroup hierarchy and copy its path into @buf. This - * function grabs cgroup_mutex and shouldn't be used inside locks used by - * cgroup controller callbacks. - * - * Return value is the same as kernfs_path(). + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +void cgroup_attach_lock(bool lock_threadgroup) { - struct cgroup_root *root; - struct cgroup *cgrp; - int hierarchy_id = 1; - int ret; - - mutex_lock(&cgroup_mutex); - spin_lock_irq(&css_set_lock); - - root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); - - if (root) { - cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); - } else { - /* if no hierarchy exists, everyone is in "/" */ - ret = strlcpy(buf, "/", buflen); - } + cpus_read_lock(); + if (lock_threadgroup) + percpu_down_write(&cgroup_threadgroup_rwsem); +} - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - return ret; +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +void cgroup_attach_unlock(bool lock_threadgroup) +{ + if (lock_threadgroup) + percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); } -EXPORT_SYMBOL_GPL(task_cgroup_path); /** * cgroup_migrate_add_task - add a migration target task to a migration context @@ -2347,7 +2517,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; - while (&cset->mg_node != tset->csets) { + while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); @@ -2360,7 +2530,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /* * This function may be called both before and - * after cgroup_taskset_migrate(). The two cases + * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ @@ -2380,7 +2550,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, } /** - * cgroup_taskset_migrate - migrate a taskset + * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. @@ -2505,10 +2675,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; - /* mixables don't care */ - if (cgroup_is_mixable(dst_cgrp)) - return 0; - /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. @@ -2532,21 +2698,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { - LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_src_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_src_preload_node); + put_css_set_locked(cset); + } - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, + mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_preload_node); + list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } @@ -2586,11 +2758,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - - if (!list_empty(&src_cset->mg_preload_node)) + if (!list_empty(&src_cset->mg_src_preload_node)) return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); @@ -2599,7 +2771,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); + list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** @@ -2624,7 +2796,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_preload_node) { + mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; @@ -2643,7 +2815,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_preload_node); + list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2651,8 +2823,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_preload_node)) - list_add_tail(&dst_cset->mg_preload_node, + if (list_empty(&dst_cset->mg_dst_preload_node)) + list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); @@ -2689,19 +2861,17 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct task_struct *task; /* - * Prevent freeing of tasks while we take a snapshot. Tasks that are - * already PF_EXITING could be freed from underneath us unless we - * take an rcu_read_lock. + * The following thread iteration should be inside an RCU critical + * section to prevent tasks from being freed while taking the snapshot. + * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); @@ -2748,8 +2918,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) - __acquires(&cgroup_threadgroup_rwsem) + bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; @@ -2766,12 +2935,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) { - percpu_down_write(&cgroup_threadgroup_rwsem); - *locked = true; - } else { - *locked = false; - } + *threadgroup_locked = pid || threadgroup; + cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { @@ -2802,17 +2967,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - if (*locked) { - percpu_up_write(&cgroup_threadgroup_rwsem); - *locked = false; - } + cgroup_attach_unlock(*threadgroup_locked); + *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool locked) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; @@ -2820,8 +2982,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - if (locked) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(threadgroup_locked); + for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -2876,29 +3038,47 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; + /* + * As cgroup_update_dfl_csses() is only called by + * cgroup_apply_control(). The csses associated with the + * given cgrp will not be affected by changes made to + * its subtree_control file. We can skip them. + */ + if (dsct == cgrp) + continue; + list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); + /* + * We need to write-lock threadgroup_rwsem while migrating tasks. + * However, if there are no source csets for @cgrp, changing its + * controllers isn't gonna produce any task migrations and the + * write-locking can be skipped safely. + */ + has_tasks = !list_empty(&mgctx.preloaded_src_csets); + cgroup_attach_lock(has_tasks); + /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, + mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ @@ -2910,7 +3090,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(has_tasks); return ret; } @@ -2931,7 +3111,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) int ssid; restart: - mutex_lock(&cgroup_mutex); + cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { @@ -2945,7 +3125,7 @@ restart: prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); @@ -3147,11 +3327,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - return ret; - - return 0; + return cgroup_update_dfl_csses(cgrp); } /** @@ -3500,18 +3676,69 @@ static int cgroup_events_show(struct seq_file *seq, void *v) static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; + struct cgroup_subsys_state *css; + int dying_cnt[CGROUP_SUBSYS_COUNT]; + int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); + + /* + * Show the number of live and dying csses associated with each of + * non-inhibited cgroup subsystems that is bound to cgroup v2. + * + * Without proper lock protection, racing is possible. So the + * numbers may not be consistent when that happens. + */ + rcu_read_lock(); + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + dying_cnt[ssid] = -1; + if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || + (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) + continue; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; + seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, + css ? (css->nr_descendants + 1) : 0); + } + seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + if (dying_cnt[ssid] >= 0) + seq_printf(seq, "nr_dying_subsys_%s %d\n", + cgroup_subsys[ssid]->name, dying_cnt[ssid]); + } + rcu_read_unlock(); return 0; } -static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, - struct cgroup *cgrp, int ssid) +#ifdef CONFIG_CGROUP_SCHED +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css associated with @ss. If the css doesn't exist + * or is offline, %NULL is returned. + */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (css && !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + +static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { + struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; @@ -3528,14 +3755,44 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, return ret; } +static int cgroup_local_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_local_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_local_stat_show(seq, css); + css_put(css); + return ret; +} +#endif + static int cpu_stat_show(struct seq_file *seq, void *v) { - struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED - ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); + ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); +#endif + return ret; +} + +static int cpu_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } @@ -3544,30 +3801,32 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } -static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, enum psi_res res) +static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) { + struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; + struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) @@ -3576,14 +3835,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); - new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + + psi = cgroup_psi(cgrp); + new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } - psi_trigger_replace(&of->priv, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3593,33 +3858,117 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_IO); + return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); + return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); + return pressure_write(of, buf, nbytes, PSI_CPU); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + return psi_show(seq, psi, PSI_IRQ); +} + +static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return pressure_write(of, buf, nbytes, PSI_IRQ); +} +#endif + +static int cgroup_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + seq_printf(seq, "%d\n", psi->enabled); + + return 0; +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + ssize_t ret; + int enable; + struct cgroup *cgrp; + struct psi_group *psi; + + ret = kstrtoint(strstrip(buf), 0, &enable); + if (ret) + return ret; + + if (enable < 0 || enable > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + psi = cgroup_psi(cgrp); + if (psi->enabled != enable) { + int i; + + /* show or hide {cpu,memory,io,irq}.pressure files */ + for (i = 0; i < NR_PSI_RESOURCES; i++) + cgroup_file_show(&cgrp->psi_files[i], enable); + + psi->enabled = enable; + if (enable) + psi_cgroup_restart(psi); + } + + cgroup_kn_unlock(of->kn); + + return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { - return psi_trigger_poll(&of->priv, of->file, pt); + struct cgroup_file_ctx *ctx = of->priv; + + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + struct cgroup_file_ctx *ctx = of->priv; + + psi_trigger_destroy(ctx->psi.trigger); } + +bool cgroup_psi_enabled(void) +{ + if (static_branch_likely(&psi_disabled)) + return false; + + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; +} + +#else /* CONFIG_PSI */ +bool cgroup_psi_enabled(void) +{ + return false; +} + #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) @@ -3656,41 +4005,133 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, return nbytes; } +static void __cgroup_kill(struct cgroup *cgrp) +{ + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + cgrp->kill_seq++; + spin_unlock_irq(&css_set_lock); + + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); + while ((task = css_task_iter_next(&it))) { + /* Ignore kernel threads here. */ + if (task->flags & PF_KTHREAD) + continue; + + /* Skip tasks that are already dying. */ + if (__fatal_signal_pending(task)) + continue; + + send_sig(SIGKILL, task, 0); + } + css_task_iter_end(&it); +} + +static void cgroup_kill(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *css; + struct cgroup *dsct; + + lockdep_assert_held(&cgroup_mutex); + + cgroup_for_each_live_descendant_pre(dsct, css, cgrp) + __cgroup_kill(dsct); +} + +static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + ssize_t ret = 0; + int kill; + struct cgroup *cgrp; + + ret = kstrtoint(strstrip(buf), 0, &kill); + if (ret) + return ret; + + if (kill != 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* + * Killing is a process directed operation, i.e. the whole thread-group + * is taken down so act like we do for cgroup.procs and only make this + * writable in non-threaded cgroups. + */ + if (cgroup_is_threaded(cgrp)) + ret = -EOPNOTSUPP; + else + cgroup_kill(cgrp); + + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) { + put_cgroup_ns(ctx->ns); + kfree(ctx); + } + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; + if (!nbytes) + return 0; + /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - - * cgroup.procs and cgroup.subtree_control. + * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -3725,7 +4166,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); @@ -3787,20 +4228,6 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* set uid and gid of cgroup dirs and files to that of the creator */ -static int cgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, @@ -3813,25 +4240,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; - int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_fsuid(), current_fsgid(), 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); - ret = cgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; @@ -3931,19 +4351,26 @@ static void cgroup_exit_cftypes(struct cftype *cfts) cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ - cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | + __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; + int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); + if (cft->flags & __CFTYPE_ADDED) { + ret = -EBUSY; + break; + } + if (cft->seq_start) kf_ops = &cgroup_kf_ops; else @@ -3956,30 +4383,29 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { - cgroup_exit_cftypes(cfts); - return -ENOMEM; + ret = -ENOMEM; + break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; + cft->flags |= __CFTYPE_ADDED; } - return 0; + if (ret) + cgroup_exit_cftypes(cfts); + return ret; } -static int cgroup_rm_cftypes_locked(struct cftype *cfts) +static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); - if (!cfts || !cfts[0].ss) - return -ENOENT; - list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); - return 0; } /** @@ -3995,12 +4421,16 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - int ret; + if (!cfts || cfts[0].name[0] == '\0') + return 0; - mutex_lock(&cgroup_mutex); - ret = cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_mutex); - return ret; + if (!(cfts[0].flags & __CFTYPE_ADDED)) + return -ENOENT; + + cgroup_lock(); + cgroup_rm_cftypes_locked(cfts); + cgroup_unlock(); + return 0; } /** @@ -4031,14 +4461,14 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -4102,6 +4532,26 @@ void cgroup_file_notify(struct cgroup_file *cfile) } /** + * cgroup_file_show - show or hide a hidden cgroup file + * @cfile: target cgroup_file obtained by setting cftype->file_offset + * @show: whether to show or hide + */ +void cgroup_file_show(struct cgroup_file *cfile, bool show) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + if (kn) + kernfs_show(kn, show); + + kernfs_put(kn); +} + +/** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk @@ -4135,7 +4585,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we - * have dropped rcu_read_lock() inbetween iterations. + * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically @@ -4176,8 +4626,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @root are accessible and @pos is a descendant of @root. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -4225,8 +4676,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct rightmost descendant as - * long as @pos is accessible. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct rightmost descendant as long as @pos + * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) @@ -4270,9 +4722,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @cgroup are accessible and @pos is a descendant of - * @cgroup. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -4383,7 +4835,7 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) } /** - * css_task_iter_advance_css_set - advance a task itererator to the next css_set + * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. @@ -4515,14 +4967,16 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { + unsigned long irqflags; + memset(it, 0, sizeof(*it)); - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); it->ss = css->ss; it->flags = flags; - if (it->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; @@ -4531,7 +4985,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, css_task_iter_advance(it); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } /** @@ -4544,12 +4998,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) @@ -4562,7 +5018,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); return it->cur_task; } @@ -4575,11 +5031,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ void css_task_iter_end(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_cset) { - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } if (it->cur_dcset) @@ -4591,21 +5049,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4613,21 +5071,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); @@ -4671,16 +5126,16 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) if (!inode) return -ENOMEM; - ret = inode_permission(inode, MAY_WRITE); + ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; int ret; @@ -4709,11 +5164,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb, bool threadgroup) + struct super_block *sb, bool threadgroup, + struct cgroup_namespace *ns) { int ret = 0; - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; @@ -4727,19 +5183,21 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, return ret; } -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + bool threadgroup) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; - bool locked; + bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4749,19 +5207,33 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, true); + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); + revert_creds(saved_cred); if (ret) goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, true); + ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); - return ret ?: nbytes; + return ret; +} + +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) @@ -4772,41 +5244,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup *src_cgrp, *dst_cgrp; - struct task_struct *task; - ssize_t ret; - bool locked; - - buf = strstrip(buf); - - dst_cgrp = cgroup_kn_lock_live(of->kn, false); - if (!dst_cgrp) - return -ENODEV; - - task = cgroup_procs_write_start(buf, false, &locked); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; - - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, false); - if (ret) - goto out_finish; - - ret = cgroup_attach_task(dst_cgrp, task, false); - -out_finish: - cgroup_procs_write_finish(task, locked); -out_unlock: - cgroup_kn_unlock(of->kn); - - return ret ?: nbytes; + return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ @@ -4873,12 +5311,26 @@ static struct cftype cgroup_base_files[] = { .write = cgroup_freeze_write, }, { + .name = "cgroup.kill", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_kill_write, + }, + { .name = "cpu.stat", .seq_show = cpu_stat_show, }, + { + .name = "cpu.stat.local", + .seq_show = cpu_local_stat_show, + }, + { } /* terminate */ +}; + +static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -4886,6 +5338,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -4893,11 +5346,27 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif + { + .name = "cgroup.pressure", + .seq_show = cgroup_pressure_show, + .write = cgroup_pressure_write, + }, #endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -4918,7 +5387,7 @@ static struct cftype cgroup_base_files[] = { * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in - * css_free_work_fn(). + * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional @@ -4947,8 +5416,10 @@ static void css_free_rwork_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + if (!cgroup_on_dfl(cgrp)) + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); + bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* @@ -4960,8 +5431,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -4981,12 +5451,14 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_mutex); + cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { + struct cgroup *parent_cgrp; + /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); @@ -4996,14 +5468,28 @@ static void css_release_work_fn(struct work_struct *work) cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); + + cgrp->nr_dying_subsys[ss->id]--; + /* + * When a css is released and ready to be freed, its + * nr_descendants must be zero. However, the corresponding + * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem + * is activated and deactivated multiple times with one or + * more of its previous activation leaving behind dying csses. + */ + WARN_ON_ONCE(css->nr_descendants); + parent_cgrp = cgroup_parent(cgrp); + while (parent_cgrp) { + parent_cgrp->nr_dying_subsys[ss->id]--; + parent_cgrp = cgroup_parent(parent_cgrp); + } } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_flush(cgrp); + cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5023,7 +5509,7 @@ static void css_release_work_fn(struct work_struct *work) NULL); } - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); @@ -5060,7 +5546,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + if (ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); @@ -5081,8 +5567,11 @@ static int online_css(struct cgroup_subsys_state *css) rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); - if (css->parent) + if (css->parent) { atomic_inc(&css->parent->online_cnt); + while ((css = css->parent)) + css->nr_descendants++; + } } return ret; } @@ -5104,6 +5593,16 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** @@ -5150,15 +5649,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, if (err) goto err_list_del; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - cgroup_parent(parent)) { - pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); - ss->warned_broken_hierarchy = true; - } - return css; err_list_del: @@ -5172,8 +5662,7 @@ err_free_css: /* * The returned cgroup is fully initialized including its control mask, but - * it isn't associated with its kernfs_node and doesn't have the control - * mask applied. + * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) @@ -5185,8 +5674,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), - GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -5194,14 +5682,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - if (cgroup_on_dfl(parent)) { - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - } + ret = cgroup_rstat_init(cgrp); + if (ret) + goto out_cancel_ref; /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + kn = kernfs_create_dir_ns(parent->kn, name, mode, + current_fsuid(), current_fsgid(), + cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_stat_exit; @@ -5218,9 +5706,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_kernfs_remove; - ret = cgroup_bpf_inherit(cgrp); - if (ret) - goto out_psi_free; + if (cgrp->root == &cgrp_dfl_root) { + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + } /* * New cgroup inherits effective freeze counter, and @@ -5240,7 +5730,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); + cgrp->ancestors[tcgrp->level] = tcgrp; if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5285,8 +5775,7 @@ out_psi_free: out_kernfs_remove: kernfs_remove(cgrp->kn); out_stat_exit: - if (cgroup_on_dfl(parent)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5298,7 +5787,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; - int level = 1; + int level = 0; lockdep_assert_held(&cgroup_mutex); @@ -5306,7 +5795,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; - if (level > cgroup->max_depth) + if (level >= cgroup->max_depth) goto fail; level++; @@ -5347,10 +5836,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) */ kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(cgrp->kn); - if (ret) - goto out_destroy; - ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; @@ -5377,14 +5862,14 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - mutex_lock(&cgroup_mutex); + cgroup_lock(); do { offline_css(css); @@ -5393,7 +5878,7 @@ static void css_killed_work_fn(struct work_struct *work) css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ @@ -5503,7 +5988,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling - * cgroup_lock_live_group(). The latter makes the csets ignored by + * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; @@ -5521,11 +6006,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); - if (parent && cgroup_is_threaded(cgrp)) + if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* @@ -5539,7 +6024,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); - cgroup_bpf_offline(cgrp); + if (cgrp->root == &cgrp_dfl_root) + cgroup_bpf_offline(cgrp); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -5577,14 +6063,14 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) pr_debug("Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_mutex); + cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; - css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); + css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); @@ -5621,7 +6107,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(online_css(css)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /** @@ -5661,8 +6147,6 @@ int __init cgroup_init_early(void) return 0; } -static u16 cgroup_disable_mask __initdata; - /** * cgroup_init - cgroup initialization * @@ -5676,19 +6160,14 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); cgroup_rstat_boot(); - /* - * The latency of the synchronize_rcu() is too high for cgroups, - * avoid it at the cost of forcing all readers into the slow path. - */ - rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); - get_user_ns(init_cgroup_ns.user_ns); - mutex_lock(&cgroup_mutex); + cgroup_lock(); /* * Add init_css_set to the hash table so that dfl_root can link to @@ -5699,7 +6178,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { @@ -5721,16 +6200,12 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (cgroup_disable_mask & (1 << ssid)) { - static_branch_disable(cgroup_subsys_enabled_key[ssid]); - printk(KERN_INFO "Disabling %s control group subsystem\n", - ss->name); + if (!cgroup_ssid_enabled(ssid)) continue; - } if (cgroup1_ssid_disabled(ssid)) - printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", - ss->name); + pr_info("Disabling %s control group subsystem in v1 mounts\n", + ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; @@ -5755,9 +6230,9 @@ int __init cgroup_init(void) if (ss->bind) ss->bind(init_css_set.subsys[ssid]); - mutex_lock(&cgroup_mutex); + cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ @@ -5769,7 +6244,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif @@ -5804,6 +6279,48 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. + */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct kernfs_node *kn; + struct cgroup *cgrp, *root_cgrp; + + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return ERR_PTR(-ENOENT); + + if (kernfs_type(kn) != KERNFS_DIR) { + kernfs_put(kn); + return ERR_PTR(-ENOENT); + } + + rcu_read_lock(); + + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (cgrp && !cgroup_tryget(cgrp)) + cgrp = NULL; + + rcu_read_unlock(); + kernfs_put(kn); + + if (!cgrp) + return ERR_PTR(-ENOENT); + + root_cgrp = current_cgns_cgroup_dfl(); + if (!cgroup_is_descendant(cgrp, root_cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-ENOENT); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_id); + +/* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. @@ -5820,7 +6337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - mutex_lock(&cgroup_mutex); + rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { @@ -5828,7 +6345,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_visible) + if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) + continue; + + cgrp = task_cgroup_from_root(tsk, root); + /* The root has already been unmounted. */ + if (!cgrp) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -5841,9 +6363,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); - - cgrp = task_cgroup_from_root(tsk, root); - /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, @@ -5856,7 +6375,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; @@ -5875,7 +6394,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + rcu_read_unlock(); kfree(buf); out: return retval; @@ -5894,16 +6413,37 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -static struct cgroup *cgroup_get_from_file(struct file *f) +/** + * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer + * @f: file corresponding to cgroup_dir + * + * Find the cgroup from a file pointer associated with a cgroup directory. + * Returns a pointer to the cgroup on success. ERR_PTR is returned if the + * cgroup cannot be found. + */ +static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; - struct cgroup *cgrp; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); - cgrp = css->cgroup; + return css->cgroup; +} + +/** + * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports + * cgroup2. + * @f: file corresponding to cgroup2_dir + */ +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); @@ -5935,16 +6475,19 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; - struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) - mutex_lock(&cgroup_mutex); + cgroup_lock(); cgroup_threadgroup_change_begin(current); spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -5952,14 +6495,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) return 0; } - f = fget_raw(kargs->cgroup); - if (!f) { + CLASS(fd_raw, f)(kargs->cgroup); + if (fd_empty(f)) { ret = -EBADF; goto err; } - sb = f->f_path.dentry->d_sb; + sb = fd_file(f)->f_path.dentry->d_sb; - dst_cgrp = cgroup_get_from_file(f); + dst_cgrp = cgroup_get_from_file(fd_file(f)); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; @@ -5980,8 +6523,23 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) if (ret) goto err; + /* + * Spawning a task directly into a cgroup works by passing a file + * descriptor to the target cgroup directory. This can even be an O_PATH + * file descriptor. But it can never be a cgroup.procs file descriptor. + * This was done on purpose so spawning into a cgroup could be + * conceptualized as an atomic + * + * fd = openat(dfd_cgroup, "cgroup.procs", ...); + * write(fd, <child-pid>, ...); + * + * sequence, i.e. it's a shorthand for the caller opening and writing + * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us + * to always use the caller's credentials. + */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, - !(kargs->flags & CLONE_THREAD)); + !(kargs->flags & CLONE_THREAD), + current->nsproxy->cgroup_ns); if (ret) goto err; @@ -5992,15 +6550,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) } put_css_set(cset); - fput(f); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); - mutex_unlock(&cgroup_mutex); - if (f) - fput(f); + cgroup_unlock(); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); @@ -6019,19 +6574,18 @@ err: static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { - cgroup_threadgroup_change_end(current); + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; - if (kargs->flags & CLONE_INTO_CGROUP) { - struct cgroup *cgrp = kargs->cgrp; - struct css_set *cset = kargs->cset; - - mutex_unlock(&cgroup_mutex); + cgroup_threadgroup_change_end(current); - if (cset) { - put_css_set(cset); - kargs->cset = NULL; - } + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + if (kargs->flags & CLONE_INTO_CGROUP) { + cgroup_unlock(); if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; @@ -6042,6 +6596,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process + * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). @@ -6085,7 +6640,7 @@ out_revert: * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded and cleans up references we took to + * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, @@ -6104,6 +6659,7 @@ void cgroup_cancel_fork(struct task_struct *child, /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process + * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. @@ -6112,6 +6668,9 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; + unsigned long cgrp_flags = 0; + bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; @@ -6123,6 +6682,14 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { + if (kargs->cgrp) { + cgrp_flags = kargs->cgrp->flags; + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { + cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } + WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); @@ -6131,23 +6698,32 @@ void cgroup_post_fork(struct task_struct *child, cset = NULL; } - /* - * If the cgroup has to be frozen, the new task has too. Let's set - * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the - * frozen state. - */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); + if (!(child->flags & PF_KTHREAD)) { + if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { + /* + * If the cgroup has to be frozen, the new task has + * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to + * get the task into the frozen state. + */ + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient + * switch from the frozen state and back. + */ + } /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later from - * do_freezer_trap(). So we avoid cgroup's transient switch - * from the frozen state and back. + * If the cgroup is to be killed notice it now and take the + * child down right after we finished preparing it for + * userspace. */ + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); @@ -6170,6 +6746,10 @@ void cgroup_post_fork(struct task_struct *child, put_css_set(rcset); } + /* Cgroup has to be killed so take down child immediately. */ + if (unlikely(kill)) + do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); + cgroup_css_set_put_fork(kargs); } @@ -6191,11 +6771,17 @@ void cgroup_exit(struct task_struct *tsk) WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + /* matches the signal->live check in css_task_iter_advance() */ + if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + + if (dl_task(tsk)) + dec_dl_tasks_cs(tsk); WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) + if (unlikely(!(tsk->flags & PF_KTHREAD) && + test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); @@ -6215,10 +6801,12 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); + if (!list_empty(&task->cg_list)) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) @@ -6241,7 +6829,19 @@ static int __init cgroup_disable(char *str) if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; - cgroup_disable_mask |= 1 << i; + + static_branch_disable(cgroup_subsys_enabled_key[i]); + pr_info("Disabling %s control group subsystem\n", + ss->name); + } + + for (i = 0; i < OPT_FEATURE_COUNT; i++) { + if (strcmp(token, cgroup_opt_feature_names[i])) + continue; + cgroup_feature_disable_mask |= 1 << i; + pr_info("Disabling %s control group feature\n", + cgroup_opt_feature_names[i]); + break; } } return 1; @@ -6258,6 +6858,12 @@ static int __init enable_cgroup_debug(char *str) } __setup("cgroup_debug", enable_cgroup_debug); +static int __init cgroup_favordynmods_setup(char *str) +{ + return (kstrtobool(str, &have_favordynmods) == 0); +} +__setup("cgroup_favordynmods=", cgroup_favordynmods_setup); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6318,54 +6924,74 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on - * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) - * if @path points to a non-directory. + * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already + * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; - struct cgroup *cgrp; + struct cgroup *cgrp = ERR_PTR(-ENOENT); + struct cgroup *root_cgrp; - mutex_lock(&cgroup_mutex); + root_cgrp = current_cgns_cgroup_dfl(); + kn = kernfs_walk_and_get(root_cgrp->kn, path); + if (!kn) + goto out; - kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); - if (kn) { - if (kernfs_type(kn) == KERNFS_DIR) { - cgrp = kn->priv; - cgroup_get_live(cgrp); - } else { - cgrp = ERR_PTR(-ENOTDIR); - } - kernfs_put(kn); - } else { - cgrp = ERR_PTR(-ENOENT); + if (kernfs_type(kn) != KERNFS_DIR) { + cgrp = ERR_PTR(-ENOTDIR); + goto out_kernfs; } - mutex_unlock(&cgroup_mutex); + rcu_read_lock(); + + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (!cgrp || !cgroup_tryget(cgrp)) + cgrp = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + +out_kernfs: + kernfs_put(kn); +out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** - * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) + * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ +struct cgroup *cgroup_v1v2_get_from_fd(int fd) +{ + CLASS(fd_raw, f)(fd); + if (fd_empty(f)) + return ERR_PTR(-EBADF); + + return cgroup_v1v2_get_from_file(fd_file(f)); +} + +/** + * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports + * cgroup2. + * @fd: fd obtained by open(cgroup2_dir) + */ struct cgroup *cgroup_get_from_fd(int fd) { - struct cgroup *cgrp; - struct file *f; + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); - f = fget_raw(fd); - if (!f) - return ERR_PTR(-EBADF); + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); - cgrp = cgroup_get_from_file(f); - fput(f); + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); @@ -6418,118 +7044,57 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } - +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ -#ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type, - u32 flags) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_query(cgrp, attr, uattr); - mutex_unlock(&cgroup_mutex); - return ret; -} -#endif /* CONFIG_CGROUP_BPF */ - #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) @@ -6560,8 +7125,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, int ssid; ssize_t ret = 0; - ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, - NULL); + ret = show_delegatable_files(cgroup_base_files, buf + ret, + PAGE_SIZE - ret, NULL); + if (cgroup_psi_enabled()) + ret += show_delegatable_files(cgroup_psi_files, buf + ret, + PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, @@ -6577,8 +7145,11 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "favordynmods\n" "memory_localevents\n" - "memory_recursiveprot\n"); + "memory_recursiveprot\n" + "memory_hugetlb_accounting\n" + "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); |