Diffstat (limited to 'kernel/cgroup')
-rw-r--r--  kernel/cgroup/Makefile               2
-rw-r--r--  kernel/cgroup/cgroup-internal.h     28
-rw-r--r--  kernel/cgroup/cgroup-v1.c          152
-rw-r--r--  kernel/cgroup/cgroup.c            1713
-rw-r--r--  kernel/cgroup/cpuset-internal.h    308
-rw-r--r--  kernel/cgroup/cpuset-v1.c          607
-rw-r--r--  kernel/cgroup/cpuset.c            4239
-rw-r--r--  kernel/cgroup/debug.c                4
-rw-r--r--  kernel/cgroup/dmem.c               830
-rw-r--r--  kernel/cgroup/freezer.c            115
-rw-r--r--  kernel/cgroup/legacy_freezer.c      44
-rw-r--r--  kernel/cgroup/misc.c               148
-rw-r--r--  kernel/cgroup/namespace.c           35
-rw-r--r--  kernel/cgroup/pids.c               176
-rw-r--r--  kernel/cgroup/rdma.c                 2
-rw-r--r--  kernel/cgroup/rstat.c              726
16 files changed, 6457 insertions(+), 2672 deletions(-)
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 12f8457ad1f9..ede31601a363 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,5 +5,7 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
 obj-$(CONFIG_CGROUP_MISC) += misc.o
+obj-$(CONFIG_CGROUP_DMEM) += dmem.o
 obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 6e36e854b512..22051b4f1ccb 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -12,7 +12,6 @@
 #define TRACE_CGROUP_PATH_LEN 1024
 extern spinlock_t trace_cgroup_path_lock;
 extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-extern bool cgroup_debug;
 extern void __init enable_debug_cgroup(void);
 
 /*
@@ -82,6 +81,8 @@ struct cgroup_file_ctx {
     struct {
         struct cgroup_pidlist   *pidlist;
     } procs1;
+
+    struct cgroup_of_peak peak;
 };
 
 /*
@@ -165,15 +166,14 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name)                       \
     struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 
-extern struct mutex cgroup_mutex;
-extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
+extern bool cgrp_dfl_visible;
 
 /* iterate across the hierarchies */
 #define for_each_root(root)                         \
-    list_for_each_entry((root), &cgroup_roots, root_list)
+    list_for_each_entry_rcu((root), &cgroup_roots, root_list,   \
+                lockdep_is_held(&cgroup_mutex))
 
 /**
  * for_each_subsys - iterate all enabled cgroup subsystems
@@ -223,8 +223,6 @@ static inline void get_css_set(struct css_set *cset)
 
 bool cgroup_ssid_enabled(int ssid);
 bool cgroup_on_dfl(const struct cgroup *cgrp);
-bool cgroup_is_thread_root(struct cgroup *cgrp);
-bool cgroup_is_threaded(struct cgroup *cgrp);
 
 struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
 struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -234,6 +232,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
               struct cgroup_namespace *ns);
 
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
 void cgroup_free_root(struct cgroup_root *root);
 void init_cgroup_root(struct cgroup_fs_context *ctx);
 int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
@@ -250,10 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                bool threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+            struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+              struct task_struct *tsk);
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-                         bool *locked)
+                         enum cgroup_attach_lock_mode *lock_mode)
     __acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+                   enum cgroup_attach_lock_mode lock_mode)
     __releases(&cgroup_threadgroup_rwsem);
 void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
 
@@ -269,9 +273,9 @@ int cgroup_task_count(const struct cgroup *cgrp);
 /*
  * rstat.c
  */
-int cgroup_rstat_init(struct cgroup *cgrp);
-void cgroup_rstat_exit(struct cgroup *cgrp);
-void cgroup_rstat_boot(void);
+int css_rstat_init(struct cgroup_subsys_state *css);
+void css_rstat_exit(struct cgroup_subsys_state *css);
+int ss_rstat_init(struct cgroup_subsys *ss);
 void cgroup_base_stat_cputime_show(struct seq_file *seq);
 
 /*
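The for_each_root() rework above switches the hierarchy list to RCU iteration, with lockdep_is_held() silencing the RCU checker for callers that hold cgroup_mutex instead of rcu_read_lock(). A minimal, self-contained sketch of the same pattern follows; every name here (demo_root, demo_roots, demo_mutex, demo_find) is a hypothetical stand-in, not a kernel API:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>

static LIST_HEAD(demo_roots);        /* stands in for cgroup_roots */
static DEFINE_MUTEX(demo_mutex);     /* stands in for cgroup_mutex */

struct demo_root {
    int hierarchy_id;
    struct list_head root_list;
};

/*
 * Walk the list either under rcu_read_lock() or with demo_mutex held.
 * The fourth argument tells RCU-lockdep that holding the mutex is an
 * equally valid way to keep entries from disappearing, mirroring the
 * for_each_root() definition above.
 */
static struct demo_root *demo_find(int id)
{
    struct demo_root *root;

    list_for_each_entry_rcu(root, &demo_roots, root_list,
                lockdep_is_held(&demo_mutex)) {
        if (root->hierarchy_id == id)
            return root;
    }
    return NULL;
}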
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 41e0837a5a0b..a9e029b570c8 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -10,6 +10,7 @@
 #include <linux/sched/task.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <linux/delayacct.h>
 #include <linux/pid_namespace.h>
@@ -32,6 +33,9 @@ static u16 cgroup_no_v1_mask;
 /* disable named v1 mounts */
 static bool cgroup_no_v1_named;
 
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
 /*
  * pidlist destructions need to be flushed on cgroup destruction. Use a
  * separate workqueue as flush domain.
@@ -46,6 +50,12 @@ bool cgroup1_ssid_disabled(int ssid)
     return cgroup_no_v1_mask & (1 << ssid);
 }
 
+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+    /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
+    return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
+}
+
 /**
  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
  * @from: attach to all cgroups of a given task
@@ -58,8 +68,8 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
     struct cgroup_root *root;
     int retval = 0;
 
-    mutex_lock(&cgroup_mutex);
-    percpu_down_write(&cgroup_threadgroup_rwsem);
+    cgroup_lock();
+    cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
     for_each_root(root) {
         struct cgroup *from_cgrp;
 
@@ -71,8 +81,8 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
         if (retval)
             break;
     }
-    percpu_up_write(&cgroup_threadgroup_rwsem);
-    mutex_unlock(&cgroup_mutex);
+    cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+    cgroup_unlock();
 
     return retval;
 }
@@ -106,9 +116,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
     if (ret)
         return ret;
 
-    mutex_lock(&cgroup_mutex);
+    cgroup_lock();
 
-    percpu_down_write(&cgroup_threadgroup_rwsem);
+    cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
 
     /* all tasks in @from are being moved, all csets are source */
     spin_lock_irq(&css_set_lock);
@@ -144,8 +154,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
     } while (task && !ret);
 out_err:
     cgroup_migrate_finish(&mgctx);
-    percpu_up_write(&cgroup_threadgroup_rwsem);
-    mutex_unlock(&cgroup_mutex);
+    cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+    cgroup_unlock();
     return ret;
 }
 
@@ -360,10 +370,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
     }
     css_task_iter_end(&it);
     length = n;
-    /* now sort & (if procs) strip out duplicates */
+    /* now sort & strip out duplicates (tgids or recycled thread PIDs) */
     sort(array, length, sizeof(pid_t), cmppid, NULL);
-    if (type == CGROUP_FILE_PROCS)
-        length = pidlist_uniq(array, length);
+    length = pidlist_uniq(array, length);
 
     l = cgroup_pidlist_find_create(cgrp, type);
     if (!l) {
@@ -431,7 +440,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
             if (l->list[mid] == pid) {
                 index = mid;
                 break;
-            } else if (l->list[mid] <= pid)
+            } else if (l->list[mid] < pid)
                 index = mid + 1;
             else
                 end = mid;
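The one-character change in cgroup_pidlist_start() above tightens the probe into a conventional lower-bound binary search over the sorted, de-duplicated pid array. A userspace rendering of the same loop that can be compiled and run directly; pidlist_find is a hypothetical name, not the kernel function:

#include <stdio.h>

/* Locate pid in a sorted array; returns the index of pid itself or of
 * the first larger entry (the resume position cgroup_pidlist_start()
 * needs after a pid disappears between reads). */
static int pidlist_find(const int *list, int length, int pid)
{
    int begin = 0, end = length;

    while (begin < end) {
        int mid = begin + (end - begin) / 2;

        if (list[mid] == pid)
            return mid;        /* exact match, resume here */
        else if (list[mid] < pid)
            begin = mid + 1;   /* target is strictly to the right */
        else
            end = mid;         /* keep mid as a candidate */
    }
    return begin;              /* first entry greater than pid */
}

int main(void)
{
    int pids[] = { 3, 7, 7, 12, 40 };

    printf("%d\n", pidlist_find(pids, 5, 12)); /* 3: exact match */
    printf("%d\n", pidlist_find(pids, 5, 8));  /* 3: resumes at 12 */
    return 0;
}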
@@ -494,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
     struct cgroup *cgrp;
     struct task_struct *task;
     const struct cred *cred, *tcred;
     ssize_t ret;
-    bool locked;
+    enum cgroup_attach_lock_mode lock_mode;
 
     cgrp = cgroup_kn_lock_live(of->kn, false);
     if (!cgrp)
         return -ENODEV;
 
-    task = cgroup_procs_write_start(buf, threadgroup, &locked);
+    task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
     ret = PTR_ERR_OR_ZERO(task);
     if (ret)
         goto out_unlock;
@@ -523,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
     ret = cgroup_attach_task(cgrp, task, threadgroup);
 out_finish:
-    cgroup_procs_write_finish(task, locked);
+    cgroup_procs_write_finish(task, lock_mode);
 out_unlock:
     cgroup_kn_unlock(of->kn);
 
@@ -546,14 +555,24 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
                       char *buf, size_t nbytes, loff_t off)
 {
     struct cgroup *cgrp;
+    struct cgroup_file_ctx *ctx;
 
     BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
 
+    /*
+     * Release agent gets called with all capabilities,
+     * require capabilities to set release agent.
+     */
+    ctx = of->priv;
+    if ((ctx->ns->user_ns != &init_user_ns) ||
+        !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+        return -EPERM;
+
     cgrp = cgroup_kn_lock_live(of->kn, false);
     if (!cgrp)
         return -ENODEV;
     spin_lock(&release_agent_path_lock);
-    strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+    strscpy(cgrp->root->release_agent_path, strstrip(buf),
         sizeof(cgrp->root->release_agent_path));
     spin_unlock(&release_agent_path_lock);
     cgroup_kn_unlock(of->kn);
@@ -658,6 +677,7 @@ struct cftype cgroup1_base_files[] = {
 int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
     struct cgroup_subsys *ss;
+    bool cgrp_v1_visible = false;
     int i;
 
     seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -666,11 +686,21 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
      * cgroup_mutex contention.
      */
 
-    for_each_subsys(ss, i)
+    for_each_subsys(ss, i) {
+        cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
+        if (!proc_show_all && cgroup1_subsys_absent(ss))
+            continue;
+
         seq_printf(m, "%s\t%d\t%d\t%d\n",
                ss->legacy_name, ss->root->hierarchy_id,
                atomic_read(&ss->root->nr_cgrps),
                cgroup_ssid_enabled(i));
+    }
+
+    if (cgrp_dfl_visible && !cgrp_v1_visible)
+        pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n");
+
     return 0;
 }
 
@@ -787,13 +817,13 @@ void cgroup1_release_agent(struct work_struct *work)
         goto out_free;
 
     spin_lock(&release_agent_path_lock);
-    strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+    strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
     spin_unlock(&release_agent_path_lock);
     if (!agentbuf[0])
         goto out_free;
 
     ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-    if (ret < 0 || ret >= PATH_MAX)
+    if (ret < 0)
         goto out_free;
 
     argv[0] = agentbuf;
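Several hunks above replace strlcpy()/strcpy() with strscpy(). Unlike strlcpy(), which returns the length the source string would have had, strscpy() always NUL-terminates and returns either the number of characters copied or -E2BIG on truncation; the path helpers follow the same convention, which is why checks like `ret < 0 || ret >= PATH_MAX` collapse into plain `ret < 0` tests. A userspace approximation of those semantics for illustration only; demo_strscpy is a hypothetical helper, not the kernel implementation:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Always NUL-terminates; reports truncation as -E2BIG instead of
 * strlcpy()'s "intended length", which callers routinely forgot to
 * compare against the buffer size. */
static long demo_strscpy(char *dst, const char *src, size_t size)
{
    size_t len = strnlen(src, size);

    if (len == size) {             /* src does not fit */
        if (size) {
            memcpy(dst, src, size - 1);
            dst[size - 1] = '\0';
        }
        return -E2BIG;
    }
    memcpy(dst, src, len + 1);     /* fits, copy including NUL */
    return (long)len;
}

int main(void)
{
    char buf[8];
    long ret = demo_strscpy(buf, "this is too long", sizeof(buf));

    printf("%ld [%s]\n", ret, buf);  /* -7 [this is] */
    return 0;
}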
seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) @@ -888,6 +920,8 @@ enum cgroup1_param { Opt_noprefix, Opt_release_agent, Opt_xattr, + Opt_favordynmods, + Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { @@ -899,6 +933,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), + fsparam_flag ("favordynmods", Opt_favordynmods), + fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; @@ -917,7 +953,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { - if (strcmp(param->key, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name) || + cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", @@ -950,10 +987,22 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; + case Opt_favordynmods: + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + break; + case Opt_nofavordynmods: + ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; @@ -997,7 +1046,8 @@ static int check_cgroupfs_options(struct fs_context *fc) mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && + !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; @@ -1084,14 +1134,14 @@ int cgroup1_reconfigure(struct fs_context *fc) if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); + strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -1195,8 +1245,11 @@ static int cgroup1_root_to_use(struct fs_context *fc) init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); - if (ret) + if (!ret) + cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); + else cgroup_free_root(root); + return ret; } @@ -1215,7 +1268,7 @@ int cgroup1_get_tree(struct fs_context *fc) if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) ret = 1; /* restart */ - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); if (!ret) ret = cgroup_do_get_tree(fc); @@ -1232,6 +1285,40 @@ int cgroup1_get_tree(struct fs_context *fc) return ret; } +/** + * task_get_cgroup1 - Acquires the associated cgroup of a task within a + * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its + * hierarchy ID. + * @tsk: The target task + * @hierarchy_id: The ID of a cgroup1 hierarchy + * + * On success, the cgroup is returned. On failure, ERR_PTR is returned. + * We limit it to cgroup1 only. 
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+    struct cgroup *cgrp = ERR_PTR(-ENOENT);
+    struct cgroup_root *root;
+    unsigned long flags;
+
+    rcu_read_lock();
+    for_each_root(root) {
+        /* cgroup1 only */
+        if (root == &cgrp_dfl_root)
+            continue;
+        if (root->hierarchy_id != hierarchy_id)
+            continue;
+        spin_lock_irqsave(&css_set_lock, flags);
+        cgrp = task_cgroup_from_root(tsk, root);
+        if (!cgrp || !cgroup_tryget(cgrp))
+            cgrp = ERR_PTR(-ENOENT);
+        spin_unlock_irqrestore(&css_set_lock, flags);
+        break;
+    }
+    rcu_read_unlock();
+    return cgrp;
+}
+
 static int __init cgroup1_wq_init(void)
 {
     /*
@@ -1239,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
      * Cap @max_active to 1 too.
      */
     cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
-                            0, 1);
+                            WQ_PERCPU, 1);
     BUG_ON(!cgroup_pidlist_destroy_wq);
     return 0;
 }
@@ -1271,8 +1358,15 @@ static int __init cgroup_no_v1(char *str)
             continue;
 
         cgroup_no_v1_mask |= 1 << i;
+        break;
         }
     }
     return 1;
 }
 __setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+    return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
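task_get_cgroup1() above pairs rcu_read_lock() with cgroup_tryget() so the returned cgroup remains valid after the RCU section ends. A hypothetical kernel-style caller, shown only to illustrate the refcount contract (demo_report_cgroup and the hierarchy ID are made up), balances it with cgroup_put():

#include <linux/cgroup.h>

static int demo_report_cgroup(struct task_struct *tsk)
{
    struct cgroup *cgrp;

    cgrp = task_get_cgroup1(tsk, 2);   /* hierarchy ID 2 is arbitrary */
    if (IS_ERR(cgrp))
        return PTR_ERR(cgrp);

    pr_info("task %d: cgroup id %llu\n",
        task_pid_nr(tsk), cgroup_id(cgrp));

    cgroup_put(cgrp);                  /* drop the tryget reference */
    return 0;
}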
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b31e1465868a..e717208cfb18 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -57,7 +57,10 @@
 #include <linux/file.h>
 #include <linux/fs_parser.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
 #include <linux/psi.h>
+#include <linux/nstree.h>
+#include <linux/irq_work.h>
 #include <net/sock.h>
 
 #define CREATE_TRACE_POINTS
@@ -89,14 +92,17 @@
 DEFINE_MUTEX(cgroup_mutex);
 DEFINE_SPINLOCK(css_set_lock);
 
-#ifdef CONFIG_PROVE_RCU
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
 #endif
 
+struct blocking_notifier_head cgroup_lifetime_notifier =
+    BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier);
+
 DEFINE_SPINLOCK(trace_cgroup_path_lock);
 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
 
 /*
  * Protects cgroup_idr and css_idr so that IDs can be released without
@@ -120,10 +126,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
  * of concurrent destructions.  Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
  * which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ *    which can never complete as it's behind in the same queue and
+ *    workqueue's max_active is 1.
  */
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
 
 /* generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
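The comment above motivates splitting the single destruction workqueue into three stage-specific ones, so a work item in one stage can wait on another stage without queueing behind it. A sketch of that arrangement under hypothetical names (demo_*_wq); this is not the actual cgroup init code:

#include <linux/workqueue.h>

static struct workqueue_struct *demo_offline_wq;
static struct workqueue_struct *demo_release_wq;
static struct workqueue_struct *demo_free_wq;

static int __init demo_wq_init(void)
{
    /* max_active = 1 keeps each stage strictly ordered, like the old
     * cgroup_destroy_wq, but a waiter in the free stage can no longer
     * deadlock behind offline work sharing its queue. */
    demo_offline_wq = alloc_workqueue("demo_offline", 0, 1);
    demo_release_wq = alloc_workqueue("demo_release", 0, 1);
    demo_free_wq    = alloc_workqueue("demo_free", 0, 1);
    if (!demo_offline_wq || !demo_release_wq || !demo_free_wq)
        return -ENOMEM;
    return 0;
}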
@@ -160,17 +189,21 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
-static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
+static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu);
+static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu);
 
 /* the default hierarchy */
-struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
+struct cgroup_root cgrp_dfl_root = {
+    .cgrp.self.rstat_cpu = &root_rstat_cpu,
+    .cgrp.rstat_base_cpu = &root_rstat_base_cpu,
+};
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
  * The default hierarchy always exists but is hidden until mounted for the
  * first time.  This is for backward compatibility.
  */
-static bool cgrp_dfl_visible;
+bool cgrp_dfl_visible;
 
 /* some controllers are not supported in the default hierarchy */
 static u16 cgrp_dfl_inhibit_ss_mask;
@@ -206,17 +239,26 @@ static u16 have_exit_callback __read_mostly;
 static u16 have_release_callback __read_mostly;
 static u16 have_canfork_callback __read_mostly;
 
+static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
-    .ns.count   = REFCOUNT_INIT(2),
+    .ns         = NS_COMMON_INIT(init_cgroup_ns),
     .user_ns    = &init_user_ns,
-    .ns.ops     = &cgroupns_operations,
-    .ns.inum    = PROC_CGROUP_INIT_INO,
     .root_cset  = &init_css_set,
 };
 
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
 
 /* cgroup optional features */
 enum cgroup_opt_features {
@@ -246,6 +288,13 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                   struct cgroup *cgrp, struct cftype cfts[],
                   bool is_add);
+static void cgroup_rt_init(void);
+
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn)   EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
 
 /**
  * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
@@ -279,8 +328,6 @@ bool cgroup_ssid_enabled(int ssid)
  *
  * - When mounting an existing superblock, mount options should match.
  *
- * - Remount is disallowed.
- *
  * - rename(2) is disallowed.
  *
  * - "tasks" is removed.  Everything should be at process granularity.  Use
@@ -307,8 +354,6 @@ bool cgroup_ssid_enabled(int ssid)
  *   masks of ancestors.
  *
  * - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
  */
 bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
@@ -351,7 +396,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp)
     return cgrp->nr_populated_csets;
 }
 
-bool cgroup_is_threaded(struct cgroup *cgrp)
+static bool cgroup_is_threaded(struct cgroup *cgrp)
 {
     return cgrp->dom_cgrp != cgrp;
 }
@@ -390,7 +435,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
 }
 
 /* is @cgrp root of a threaded subtree? */
-bool cgroup_is_thread_root(struct cgroup *cgrp)
+static bool cgroup_is_thread_root(struct cgroup *cgrp)
 {
     /* thread root should be a domain */
     if (cgroup_is_threaded(cgrp))
@@ -489,28 +534,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 }
 
 /**
- * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest
- *
- * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
- * or is offline, %NULL is returned.
- */
-static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
-                             struct cgroup_subsys *ss)
-{
-    struct cgroup_subsys_state *css;
-
-    rcu_read_lock();
-    css = cgroup_css(cgrp, ss);
-    if (css && !css_tryget_online(css))
-        css = NULL;
-    rcu_read_unlock();
-
-    return css;
-}
-
-/**
  * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -613,7 +636,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_e_css);
 static void cgroup_get_live(struct cgroup *cgrp)
 {
     WARN_ON_ONCE(cgroup_is_dead(cgrp));
-    css_get(&cgrp->self);
+    cgroup_get(cgrp);
 }
 
 /**
@@ -649,9 +672,22 @@ int cgroup_task_count(const struct cgroup *cgrp)
     return count;
 }
 
+static struct cgroup *kn_priv(struct kernfs_node *kn)
+{
+    struct kernfs_node *parent;
+    /*
+     * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT.
+     * Therefore it is always safe to dereference this pointer outside of a
+     * RCU section.
+     */
+    parent = rcu_dereference_check(kn->__parent,
+            kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT);
+    return parent->priv;
+}
+
 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
-    struct cgroup *cgrp = of->kn->parent->priv;
+    struct cgroup *cgrp = kn_priv(of->kn);
     struct cftype *cft = of_cft(of);
 
     /*
@@ -675,7 +711,7 @@ EXPORT_SYMBOL_GPL(of_css);
  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
  * @cgrp: the target cgroup to iterate css's of
  *
- * Should be called under cgroup_[tree_]mutex.
+ * Should be called under cgroup_mutex.
  */
 #define for_each_css(css, ssid, cgrp)                   \
     for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)    \
         if (!((css) = rcu_dereference_check(            \
                 (cgrp)->subsys[(ssid)],         \
                 lockdep_is_held(&cgroup_mutex))))   \
             ;                       \
         else
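kn_priv() above leans on rcu_dereference_check(): the extra condition tells RCU-lockdep why an unlocked dereference is safe whenever the kernfs root guarantees an invariant parent. The same pattern in isolation, with hypothetical types and a plain flag standing in for KERNFS_ROOT_INVARIANT_PARENT:

#include <linux/rcupdate.h>

/* Hypothetical node whose ->parent pointer is RCU-managed but, for this
 * tree flavour, never changes after creation. */
struct demo_node {
    struct demo_node __rcu *parent;
    bool parent_is_invariant;
    void *priv;
};

static void *demo_parent_priv(struct demo_node *n)
{
    struct demo_node *parent;

    /*
     * rcu_dereference_check() keeps lockdep quiet when the second
     * argument proves the pointer cannot change under us; otherwise an
     * RCU read-side critical section is required.
     */
    parent = rcu_dereference_check(n->parent, n->parent_is_invariant);
    return parent->priv;
}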
 
 /**
@@ -685,21 +721,6 @@ EXPORT_SYMBOL_GPL(of_css);
- * for_each_e_css - iterate all effective css's of a cgroup
- * @css: the iteration cursor
- * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
- * @cgrp: the target cgroup to iterate css's of
- *
- * Should be called under cgroup_[tree_]mutex.
- */
-#define for_each_e_css(css, ssid, cgrp)                     \
-    for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
-        if (!((css) = cgroup_e_css_by_mask(cgrp,            \
-                           cgroup_subsys[(ssid)])))         \
-            ;                               \
-        else
-
-/**
  * do_each_subsys_mask - filter for_each_subsys with a bitmask
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -765,7 +786,8 @@ struct css_set init_css_set = {
     .task_iters     = LIST_HEAD_INIT(init_css_set.task_iters),
     .threaded_csets     = LIST_HEAD_INIT(init_css_set.threaded_csets),
     .cgrp_links     = LIST_HEAD_INIT(init_css_set.cgrp_links),
-    .mg_preload_node    = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+    .mg_src_preload_node    = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+    .mg_dst_preload_node    = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
     .mg_node        = LIST_HEAD_INIT(init_css_set.mg_node),
 
     /*
@@ -921,7 +943,8 @@ static void css_set_move_task(struct task_struct *task,
         /*
          * We are synchronized through cgroup_threadgroup_rwsem
          * against PF_EXITING setting such that we can't race
-         * against cgroup_exit()/cgroup_free() dropping the css_set.
+         * against cgroup_task_dead()/cgroup_task_free() dropping
+         * the css_set.
          */
         WARN_ON_ONCE(task->flags & PF_EXITING);
 
@@ -939,7 +962,7 @@ static void css_set_move_task(struct task_struct *task,
 #define CSS_SET_HASH_BITS   7
 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
-static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state **css)
 {
     unsigned long key = 0UL;
     struct cgroup_subsys *ss;
@@ -1080,7 +1103,7 @@ static bool compare_css_sets(struct css_set *cset,
  */
 static struct css_set *find_existing_css_set(struct css_set *old_cset,
                     struct cgroup *cgrp,
-                    struct cgroup_subsys_state *template[])
+                    struct cgroup_subsys_state **template)
 {
     struct cgroup_root *root = cgrp->root;
     struct cgroup_subsys *ss;
@@ -1240,7 +1263,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
     INIT_LIST_HEAD(&cset->threaded_csets);
     INIT_HLIST_NODE(&cset->hlist);
     INIT_LIST_HEAD(&cset->cgrp_links);
-    INIT_LIST_HEAD(&cset->mg_preload_node);
+    INIT_LIST_HEAD(&cset->mg_src_preload_node);
+    INIT_LIST_HEAD(&cset->mg_dst_preload_node);
     INIT_LIST_HEAD(&cset->mg_node);
 
     /* Copy the set of subsystem state objects generated in
@@ -1302,11 +1326,41 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
-    struct cgroup *root_cgrp = kf_root->kn->priv;
+    struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
 
     return root_cgrp->root;
 }
 
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+    bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+    /*
+     * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+     * favordynmods can flip while task is between
+     * cgroup_threadgroup_change_begin() and end(), so down_write global
+     * cgroup_threadgroup_rwsem to synchronize them.
+     *
+     * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+     * cgroup_threadgroup_rwsem doesn't exclude tasks between
+     * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to
+     * turn off. As the scenario is unlikely, simply disallow disabling once
+     * enabled and print out a warning.
+     */
+    percpu_down_write(&cgroup_threadgroup_rwsem);
+    if (favor && !favoring) {
+        cgroup_enable_per_threadgroup_rwsem = true;
+        rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+        root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+    } else if (!favor && favoring) {
+        if (cgroup_enable_per_threadgroup_rwsem)
+            pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
+        rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+        root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+    }
+    percpu_up_write(&cgroup_threadgroup_rwsem);
+}
+
 static int cgroup_init_root_id(struct cgroup_root *root)
 {
     int id;
@@ -1330,13 +1384,14 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 
 void cgroup_free_root(struct cgroup_root *root)
 {
-    kfree(root);
+    kfree_rcu(root, rcu);
 }
 
 static void cgroup_destroy_root(struct cgroup_root *root)
 {
     struct cgroup *cgrp = &root->cgrp;
     struct cgrp_cset_link *link, *tmp_link;
+    int ret;
 
     trace_cgroup_destroy_root(root);
 
@@ -1345,6 +1400,10 @@ static void cgroup_destroy_root(struct cgroup_root *root)
     BUG_ON(atomic_read(&root->nr_cgrps));
     BUG_ON(!list_empty(&cgrp->self.children));
 
+    ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+                       CGROUP_LIFETIME_OFFLINE, cgrp);
+    WARN_ON_ONCE(notifier_to_errno(ret));
+
     /* Rebind all subsystems back to the default hierarchy */
     WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
 
@@ -1362,90 +1421,129 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 
     spin_unlock_irq(&css_set_lock);
 
-    if (!list_empty(&root->root_list)) {
-        list_del(&root->root_list);
-        cgroup_root_count--;
-    }
+    WARN_ON_ONCE(list_empty(&root->root_list));
+    list_del_rcu(&root->root_list);
+    cgroup_root_count--;
+
+    if (!have_favordynmods)
+        cgroup_favor_dynmods(root, false);
 
     cgroup_exit_root_id(root);
 
-    mutex_unlock(&cgroup_mutex);
+    cgroup_unlock();
 
-    cgroup_rstat_exit(cgrp);
     kernfs_destroy_root(root->kf_root);
     cgroup_free_root(root);
 }
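cgroup_favor_dynmods() above flips the favordynmods state only while write-holding cgroup_threadgroup_rwsem, so no thread can observe a half-switched mode from inside a read-side section. Reduced to its skeleton with hypothetical names (demo_rwsem, demo_fast_mode, demo_set_fast_mode), the guard looks like this:

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(demo_rwsem);  /* cgroup_threadgroup_rwsem analogue */
static bool demo_fast_mode;                     /* the favordynmods-style state */

/* Readers consult demo_fast_mode inside percpu_down_read() sections;
 * taking the rwsem for write guarantees none of them sees the flag
 * mid-flip, mirroring cgroup_favor_dynmods() above. */
static void demo_set_fast_mode(bool enable)
{
    percpu_down_write(&demo_rwsem);
    demo_fast_mode = enable;
    percpu_up_write(&demo_rwsem);
}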
 /*
- * look up cgroup associated with current task's cgroup namespace on the
- * specified hierarchy
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
  */
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+                             struct cgroup_root *root)
 {
-    struct cgroup *res = NULL;
-    struct css_set *cset;
-
-    lockdep_assert_held(&css_set_lock);
+    struct cgroup *res_cgroup = NULL;
 
-    rcu_read_lock();
-
-    cset = current->nsproxy->cgroup_ns->root_cset;
     if (cset == &init_css_set) {
-        res = &root->cgrp;
+        res_cgroup = &root->cgrp;
     } else if (root == &cgrp_dfl_root) {
-        res = cset->dfl_cgrp;
+        res_cgroup = cset->dfl_cgrp;
     } else {
         struct cgrp_cset_link *link;
 
+        lockdep_assert_held(&css_set_lock);
         list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
             struct cgroup *c = link->cgrp;
 
             if (c->root == root) {
-                res = c;
+                res_cgroup = c;
                 break;
             }
         }
     }
 
-    rcu_read_unlock();
-
-    BUG_ON(!res);
-    return res;
+    /*
+     * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+     * before we remove the cgroup root from the root_list. Consequently,
+     * when accessing a cgroup root, the cset_link may have already been
+     * freed, resulting in a NULL res_cgroup. However, by holding the
+     * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+     * If we don't hold cgroup_mutex in the caller, we must do the NULL
+     * check.
+     */
+    return res_cgroup;
 }
 
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
-                        struct cgroup_root *root)
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
 {
     struct cgroup *res = NULL;
+    struct css_set *cset;
 
-    lockdep_assert_held(&cgroup_mutex);
     lockdep_assert_held(&css_set_lock);
 
-    if (cset == &init_css_set) {
-        res = &root->cgrp;
-    } else if (root == &cgrp_dfl_root) {
-        res = cset->dfl_cgrp;
-    } else {
-        struct cgrp_cset_link *link;
+    rcu_read_lock();
 
-        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-            struct cgroup *c = link->cgrp;
+    cset = current->nsproxy->cgroup_ns->root_cset;
+    res = __cset_cgroup_from_root(cset, root);
 
-            if (c->root == root) {
-                res = c;
-                break;
-            }
-        }
-    }
+    rcu_read_unlock();
 
-    BUG_ON(!res);
+    /*
+     * The namespace_sem is held by current, so the root cgroup can't
+     * be umounted. Therefore, we can ensure that the res is non-NULL.
+     */
+    WARN_ON_ONCE(!res);
     return res;
 }
 
 /*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ *   pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ *   switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+    struct css_set *cset;
+
+    if (current->nsproxy) {
+        cset = current->nsproxy->cgroup_ns->root_cset;
+        return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+    } else {
+        /*
+         * NOTE: This function may be called from bpf_cgroup_from_id()
+         * on a task which has already passed exit_nsproxy_namespaces()
+         * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+         * make all cgroups visible for lookups.
+         */
+        return &cgrp_dfl_root.cgrp;
+    }
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+                        struct cgroup_root *root)
+{
+    lockdep_assert_held(&css_set_lock);
+
+    return __cset_cgroup_from_root(cset, root);
+}
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
  */
 struct cgroup *task_cgroup_from_root(struct task_struct *task,
                      struct cgroup_root *root)
@@ -1587,9 +1685,9 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
     if (kernfs_type(kn) == KERNFS_DIR)
         cgrp = kn->priv;
     else
-        cgrp = kn->parent->priv;
+        cgrp = kn_priv(kn);
 
-    mutex_unlock(&cgroup_mutex);
+    cgroup_unlock();
 
     kernfs_unbreak_active_protection(kn);
     cgroup_put(cgrp);
@@ -1619,7 +1717,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
     if (kernfs_type(kn) == KERNFS_DIR)
         cgrp = kn->priv;
     else
-        cgrp = kn->parent->priv;
+        cgrp = kn_priv(kn);
 
     /*
      * We're gonna grab cgroup_mutex which nests outside kernfs
@@ -1634,7 +1732,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
     if (drain_offline)
         cgroup_lock_and_drain_offline(cgrp);
     else
-        mutex_lock(&cgroup_mutex);
+        cgroup_lock();
 
     if (!cgroup_is_dead(cgrp))
         return cgrp;
@@ -1657,7 +1755,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
         cfile->kn = NULL;
         spin_unlock_irq(&cgroup_file_kn_lock);
 
-        del_timer_sync(&cfile->notify_timer);
+        timer_delete_sync(&cfile->notify_timer);
     }
 
     kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1677,13 +1775,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
 
     css->flags &= ~CSS_VISIBLE;
 
-    if (!css->ss) {
-        if (cgroup_on_dfl(cgrp))
-            cfts = cgroup_base_files;
-        else
-            cfts = cgroup1_base_files;
-
-        cgroup_addrm_files(css, cgrp, cfts, false);
+    if (css_is_self(css)) {
+        if (cgroup_on_dfl(cgrp)) {
+            cgroup_addrm_files(css, cgrp,
+                       cgroup_base_files, false);
+            if (cgroup_psi_enabled())
+                cgroup_addrm_files(css, cgrp,
+                           cgroup_psi_files, false);
+        } else {
+            cgroup_addrm_files(css, cgrp,
+                       cgroup1_base_files, false);
+        }
     } else {
         list_for_each_entry(cfts, &css->ss->cfts, node)
             cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1702,18 +1804,31 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
     struct cftype *cfts, *failed_cfts;
     int ret;
 
-    if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+    if (css->flags & CSS_VISIBLE)
         return 0;
 
-    if (!css->ss) {
-        if (cgroup_on_dfl(cgrp))
-            cfts = cgroup_base_files;
-        else
-            cfts = cgroup1_base_files;
-
-        ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
-        if (ret < 0)
-            return ret;
+    if (css_is_self(css)) {
+        if (cgroup_on_dfl(cgrp)) {
+            ret = cgroup_addrm_files(css, cgrp,
+                         cgroup_base_files, true);
+            if (ret < 0)
+                return ret;
+
+            if (cgroup_psi_enabled()) {
+                ret = cgroup_addrm_files(css, cgrp,
+                             cgroup_psi_files, true);
+                if (ret < 0) {
+                    cgroup_addrm_files(css, cgrp,
+                               cgroup_base_files, false);
+                    return ret;
+                }
+            }
+        } else {
+            ret = cgroup_addrm_files(css, cgrp,
+                         cgroup1_base_files, true);
+            if (ret < 0)
+                return ret;
+        }
     } else {
         list_for_each_entry(cfts, &css->ss->cfts, node) {
             ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -1740,7 +1855,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 {
     struct cgroup *dcgrp = &dst_root->cgrp;
     struct cgroup_subsys *ss;
-    int ssid, i, ret;
+    int ssid, ret;
     u16 dfl_disable_ss_mask = 0;
 
     lockdep_assert_held(&cgroup_mutex);
@@ -1784,7 +1899,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
         struct cgroup_root *src_root = ss->root;
         struct cgroup *scgrp = &src_root->cgrp;
         struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
-        struct css_set *cset;
+        struct css_set *cset, *cset_pos;
+        struct css_task_iter *it;
 
         WARN_ON(!css || cgroup_css(dcgrp, ss));
@@ -1799,19 +1915,26 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
         RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
         rcu_assign_pointer(dcgrp->subsys[ssid], css);
         ss->root = dst_root;
-        css->cgroup = dcgrp;
 
         spin_lock_irq(&css_set_lock);
-        hash_for_each(css_set_table, i, cset, hlist)
+        css->cgroup = dcgrp;
+        WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+        list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+                     e_cset_node[ss->id]) {
             list_move_tail(&cset->e_cset_node[ss->id],
                        &dcgrp->e_csets[ss->id]);
-        spin_unlock_irq(&css_set_lock);
-
-        if (ss->css_rstat_flush) {
-            list_del_rcu(&css->rstat_css_node);
-            list_add_rcu(&css->rstat_css_node,
-                     &dcgrp->rstat_css_list);
+            /*
+             * all css_sets of scgrp together in same order to dcgrp,
+             * patch in-flight iterators to preserve correct iteration.
+             * since the iterator is always advanced right away and
+             * finished when it->cset_pos meets it->cset_head, so only
+             * update it->cset_head is enough here.
+             */
+            list_for_each_entry(it, &cset->task_iters, iters_node)
+                if (it->cset_head == &scgrp->e_csets[ss->id])
+                    it->cset_head = &dcgrp->e_csets[ss->id];
         }
+        spin_unlock_irq(&css_set_lock);
 
         /* default hierarchy doesn't enable controllers by default */
         dst_root->subsys_mask |= 1 << ssid;
@@ -1852,7 +1975,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
     len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
     spin_unlock_irq(&css_set_lock);
 
-    if (len >= PATH_MAX)
+    if (len == -E2BIG)
         len = -ERANGE;
     else if (len > 0) {
         seq_escape(sf, buf, " \t\n\\");
@@ -1864,15 +1987,21 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
 enum cgroup2_param {
     Opt_nsdelegate,
+    Opt_favordynmods,
     Opt_memory_localevents,
     Opt_memory_recursiveprot,
+    Opt_memory_hugetlb_accounting,
+    Opt_pids_localevents,
     nr__cgroup2_params
 };
 
 static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
     fsparam_flag("nsdelegate",      Opt_nsdelegate),
+    fsparam_flag("favordynmods",        Opt_favordynmods),
     fsparam_flag("memory_localevents",  Opt_memory_localevents),
     fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
+    fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+    fsparam_flag("pids_localevents",    Opt_pids_localevents),
     {}
 };
 
@@ -1890,16 +2019,32 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
     case Opt_nsdelegate:
         ctx->flags |= CGRP_ROOT_NS_DELEGATE;
         return 0;
+    case Opt_favordynmods:
+        ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+        return 0;
     case Opt_memory_localevents:
         ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
         return 0;
     case Opt_memory_recursiveprot:
         ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
         return 0;
+    case Opt_memory_hugetlb_accounting:
+        ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+        return 0;
+    case Opt_pids_localevents:
+        ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+        return 0;
     }
     return -EINVAL;
 }
 
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+    struct cgroup_file_ctx *ctx = of->priv;
+
+    return &ctx->peak;
+}
+
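The new cgroup2 mount options above follow the standard fs_parameter_spec/fs_parse() pattern, the same one cpuset adopts further down. A minimal sketch of the idiom for a hypothetical filesystem flag; all demo_* names are illustrative:

#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum demo_param { Opt_fastmode };

static const struct fs_parameter_spec demo_fs_parameters[] = {
    fsparam_flag("fastmode", Opt_fastmode),
    {}
};

static int demo_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
    struct fs_parse_result result;
    int opt;

    opt = fs_parse(fc, demo_fs_parameters, param, &result);
    if (opt < 0)
        return opt;          /* e.g. -ENOPARAM for unknown keys */

    switch (opt) {
    case Opt_fastmode:
        /* record the flag in the fs-private context here */
        return 0;
    }
    return -EINVAL;
}

The spec table is also hooked up as the filesystem's .parameters so the VFS can validate options before .parse_param runs.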
 static void apply_cgroup_root_flags(unsigned int root_flags)
 {
     if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
@@ -1908,6 +2053,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
         else
             cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
 
+        cgroup_favor_dynmods(&cgrp_dfl_root,
+                     root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
         if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
             cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
         else
@@ -1917,6 +2065,16 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
             cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
         else
             cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+        if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+            cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+        else
+            cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+        if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+            cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+        else
+            cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
     }
 }
 
@@ -1924,10 +2082,16 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
     if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
         seq_puts(seq, ",nsdelegate");
+    if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+        seq_puts(seq, ",favordynmods");
     if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
         seq_puts(seq, ",memory_localevents");
     if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
         seq_puts(seq, ",memory_recursiveprot");
+    if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+        seq_puts(seq, ",memory_hugetlb_accounting");
+    if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+        seq_puts(seq, ",pids_localevents");
 
     return 0;
 }
@@ -1954,12 +2118,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
     cgrp->dom_cgrp = cgrp;
     cgrp->max_descendants = INT_MAX;
     cgrp->max_depth = INT_MAX;
-    INIT_LIST_HEAD(&cgrp->rstat_css_list);
     prev_cputime_init(&cgrp->prev_cputime);
 
     for_each_subsys(ss, ssid)
         INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
+#ifdef CONFIG_CGROUP_BPF
+    for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
+        cgrp->bpf.revisions[i] = 1;
+#endif
+
     init_waitqueue_head(&cgrp->offline_waitq);
     INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
@@ -1969,12 +2137,13 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
     struct cgroup_root *root = ctx->root;
     struct cgroup *cgrp = &root->cgrp;
 
-    INIT_LIST_HEAD(&root->root_list);
+    INIT_LIST_HEAD_RCU(&root->root_list);
     atomic_set(&root->nr_cgrps, 1);
     cgrp->root = root;
     init_cgroup_housekeeping(cgrp);
 
-    root->flags = ctx->flags;
+    /* DYNMODS must be modified through cgroup_favor_dynmods() */
+    root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
     if (ctx->release_agent)
         strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
     if (ctx->name)
@@ -2019,21 +2188,22 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
     root->kf_root = kernfs_create_root(kf_sops,
                        KERNFS_ROOT_CREATE_DEACTIVATED |
                        KERNFS_ROOT_SUPPORT_EXPORTOP |
-                       KERNFS_ROOT_SUPPORT_USER_XATTR,
+                       KERNFS_ROOT_SUPPORT_USER_XATTR |
+                       KERNFS_ROOT_INVARIANT_PARENT,
                        root_cgrp);
     if (IS_ERR(root->kf_root)) {
         ret = PTR_ERR(root->kf_root);
         goto exit_root_id;
     }
-    root_cgrp->kn = root->kf_root->kn;
+    root_cgrp->kn = kernfs_root_to_node(root->kf_root);
     WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
-    root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+    root_cgrp->ancestors[0] = root_cgrp;
 
     ret = css_populate_dir(&root_cgrp->self);
     if (ret)
         goto destroy_root;
 
-    ret = cgroup_rstat_init(root_cgrp);
+    ret = css_rstat_init(&root_cgrp->self);
     if (ret)
         goto destroy_root;
 
@@ -2041,8 +2211,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
     if (ret)
         goto exit_stats;
 
-    ret = cgroup_bpf_inherit(root_cgrp);
-    WARN_ON_ONCE(ret);
+    ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+                       CGROUP_LIFETIME_ONLINE, root_cgrp);
+    WARN_ON_ONCE(notifier_to_errno(ret));
 
     trace_cgroup_setup_root(root);
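cgroup_setup_root() and cgroup_destroy_root() now broadcast CGROUP_LIFETIME_ONLINE/OFFLINE through cgroup_lifetime_notifier instead of calling controller-specific hooks (such as cgroup_bpf_inherit()) directly. A hypothetical subscriber, assuming the chain is visible to the caller, registers like any blocking notifier:

#include <linux/notifier.h>

/* Hypothetical listener: the callback receives the action code and, per
 * the call sites above, the affected root cgroup as data.  Handlers
 * return NOTIFY_OK, or wrap an errno with notifier_from_errno(). */
static int demo_lifetime_cb(struct notifier_block *nb,
                unsigned long action, void *data)
{
    /* act on CGROUP_LIFETIME_ONLINE / CGROUP_LIFETIME_OFFLINE here */
    return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
    .notifier_call = demo_lifetime_cb,
};

static int __init demo_register(void)
{
    return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
                        &demo_nb);
}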
@@ -2051,7 +2222,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
      * care of subsystems' refcounts, which are explicitly dropped in
      * the failure exit path.
      */
-    list_add(&root->root_list, &cgroup_roots);
+    list_add_rcu(&root->root_list, &cgroup_roots);
     cgroup_root_count++;
 
     /*
@@ -2073,7 +2244,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
     goto out;
 
 exit_stats:
-    cgroup_rstat_exit(root_cgrp);
+    css_rstat_exit(&root_cgrp->self);
 destroy_root:
     kernfs_destroy_root(root->kf_root);
     root->kf_root = NULL;
@@ -2107,13 +2278,13 @@ int cgroup_do_get_tree(struct fs_context *fc)
         struct super_block *sb = fc->root->d_sb;
         struct cgroup *cgrp;
 
-        mutex_lock(&cgroup_mutex);
+        cgroup_lock();
         spin_lock_irq(&css_set_lock);
 
         cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
 
         spin_unlock_irq(&css_set_lock);
-        mutex_unlock(&cgroup_mutex);
+        cgroup_unlock();
 
         nsdentry = kernfs_node_dentry(cgrp->kn, sb);
         dput(fc->root);
@@ -2150,7 +2321,7 @@ static int cgroup_get_tree(struct fs_context *fc)
     struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
     int ret;
 
-    cgrp_dfl_visible = true;
+    WRITE_ONCE(cgrp_dfl_visible, true);
     cgroup_get_live(&cgrp_dfl_root.cgrp);
     ctx->root = &cgrp_dfl_root;
 
@@ -2196,6 +2367,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
     put_user_ns(fc->user_ns);
     fc->user_ns = get_user_ns(ctx->ns->user_ns);
     fc->global = true;
+
+    if (have_favordynmods)
+        ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+
     return 0;
 }
 
@@ -2211,10 +2386,8 @@ static void cgroup_kill_sb(struct super_block *sb)
      * And don't kill the default root.
      */
     if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
-        !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
-        cgroup_bpf_offline(&root->cgrp);
+        !percpu_ref_is_dying(&root->cgrp.self.refcnt))
         percpu_ref_kill(&root->cgrp.self.refcnt);
-    }
     cgroup_put(&root->cgrp);
     kernfs_kill_sb(sb);
 }
@@ -2235,10 +2408,38 @@ static struct file_system_type cgroup2_fs_type = {
     .fs_flags = FS_USERNS_MOUNT,
 };
 
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
+enum cpuset_param {
+    Opt_cpuset_v2_mode,
+};
+
+static const struct fs_parameter_spec cpuset_fs_parameters[] = {
+    fsparam_flag  ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+    {}
+};
+
+static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+    struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+    struct fs_parse_result result;
+    int opt;
+
+    opt = fs_parse(fc, cpuset_fs_parameters, param, &result);
+    if (opt < 0)
+        return opt;
+
+    switch (opt) {
+    case Opt_cpuset_v2_mode:
+        ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+        return 0;
+    }
+    return -EINVAL;
+}
+
 static const struct fs_context_operations cpuset_fs_context_ops = {
     .get_tree   = cgroup1_get_tree,
     .free       = cgroup_fs_context_free,
+    .parse_param    = cpuset_parse_param,
 };
 
 /*
@@ -2275,6 +2476,7 @@ static int cpuset_init_fs_context(struct fs_context *fc)
 static struct file_system_type cpuset_fs_type = {
     .name           = "cpuset",
     .init_fs_context    = cpuset_init_fs_context,
+    .parameters     = cpuset_fs_parameters,
     .fs_flags       = FS_USERNS_MOUNT,
 };
 #endif
@@ -2292,56 +2494,92 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 {
     int ret;
 
-    mutex_lock(&cgroup_mutex);
+    cgroup_lock();
     spin_lock_irq(&css_set_lock);
 
     ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
 
     spin_unlock_irq(&css_set_lock);
-    mutex_unlock(&cgroup_mutex);
+    cgroup_unlock();
 
     return ret;
 }
 EXPORT_SYMBOL_GPL(cgroup_path_ns);
 
 /**
- * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
- * @task: target task
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Determine @task's cgroup on the first (the one with the lowest non-zero
- * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
- * function grabs cgroup_mutex and shouldn't be used inside locks used by
- * cgroup controller callbacks.
- *
- * Return value is the same as kernfs_path().
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_mode: whether to acquire a rwsem, and which one
+ * @tsk: thread group to lock
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
  */
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+            struct task_struct *tsk)
 {
-    struct cgroup_root *root;
-    struct cgroup *cgrp;
-    int hierarchy_id = 1;
-    int ret;
-
-    mutex_lock(&cgroup_mutex);
-    spin_lock_irq(&css_set_lock);
+    cpus_read_lock();
 
-    root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
+    switch (lock_mode) {
+    case CGRP_ATTACH_LOCK_NONE:
+        break;
+    case CGRP_ATTACH_LOCK_GLOBAL:
+        percpu_down_write(&cgroup_threadgroup_rwsem);
+        break;
+    case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+        down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+        break;
+    default:
+        pr_warn("cgroup: Unexpected attach lock mode.");
+        break;
+    }
+}
 
-    if (root) {
-        cgrp = task_cgroup_from_root(task, root);
-        ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
-    } else {
-        /* if no hierarchy exists, everyone is in "/" */
-        ret = strlcpy(buf, "/", buflen);
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_mode: whether to release a rwsem, and which one
+ * @tsk: thread group to unlock
+ */
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+              struct task_struct *tsk)
+{
+    switch (lock_mode) {
+    case CGRP_ATTACH_LOCK_NONE:
+        break;
+    case CGRP_ATTACH_LOCK_GLOBAL:
+        percpu_up_write(&cgroup_threadgroup_rwsem);
+        break;
+    case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+        up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+        break;
+    default:
+        pr_warn("cgroup: Unexpected attach lock mode.");
+        break;
     }
 
-    spin_unlock_irq(&css_set_lock);
-    mutex_unlock(&cgroup_mutex);
-    return ret;
+    cpus_read_unlock();
 }
-EXPORT_SYMBOL_GPL(task_cgroup_path);
 
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
@@ -2425,7 +2663,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
             /*
              * This function may be called both before and
-             * after cgroup_taskset_migrate().  The two cases
+             * after cgroup_migrate_execute().  The two cases
              * can be distinguished by looking at whether @cset
              * has its ->mg_dst_cset set.
              */
@@ -2570,10 +2808,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
     if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
         return -EOPNOTSUPP;
 
-    /* mixables don't care */
-    if (cgroup_is_mixable(dst_cgrp))
-        return 0;
-
     /*
      * If @dst_cgrp is already or can become a thread root or is
      * threaded, it doesn't matter.
@@ -2597,21 +2831,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
  */
 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
 {
-    LIST_HEAD(preloaded);
     struct css_set *cset, *tmp_cset;
 
     lockdep_assert_held(&cgroup_mutex);
 
     spin_lock_irq(&css_set_lock);
 
-    list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
-    list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+    list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+                 mg_src_preload_node) {
+        cset->mg_src_cgrp = NULL;
+        cset->mg_dst_cgrp = NULL;
+        cset->mg_dst_cset = NULL;
+        list_del_init(&cset->mg_src_preload_node);
+        put_css_set_locked(cset);
+    }
 
-    list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+    list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+                 mg_dst_preload_node) {
         cset->mg_src_cgrp = NULL;
         cset->mg_dst_cgrp = NULL;
         cset->mg_dst_cset = NULL;
-        list_del_init(&cset->mg_preload_node);
+        list_del_init(&cset->mg_dst_preload_node);
         put_css_set_locked(cset);
     }
 
@@ -2651,7 +2891,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
     if (src_cset->dead)
         return;
 
-    if (!list_empty(&src_cset->mg_preload_node))
+    if (!list_empty(&src_cset->mg_src_preload_node))
         return;
 
     src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
@@ -2664,7 +2904,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
     src_cset->mg_src_cgrp = src_cgrp;
     src_cset->mg_dst_cgrp = dst_cgrp;
     get_css_set(src_cset);
-    list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+    list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
 }
 
 /**
@@ -2689,7 +2929,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 
     /* look up the dst cset for each src cset and link it to src */
     list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
-                 mg_preload_node) {
+                 mg_src_preload_node) {
         struct css_set *dst_cset;
         struct cgroup_subsys *ss;
         int ssid;
@@ -2708,7 +2948,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
         if (src_cset == dst_cset) {
             src_cset->mg_src_cgrp = NULL;
             src_cset->mg_dst_cgrp = NULL;
-            list_del_init(&src_cset->mg_preload_node);
+            list_del_init(&src_cset->mg_src_preload_node);
             put_css_set(src_cset);
             put_css_set(dst_cset);
             continue;
@@ -2716,8 +2956,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 
         src_cset->mg_dst_cset = dst_cset;
 
-        if (list_empty(&dst_cset->mg_preload_node))
-            list_add_tail(&dst_cset->mg_preload_node,
+        if (list_empty(&dst_cset->mg_dst_preload_node))
+            list_add_tail(&dst_cset->mg_dst_preload_node,
                       &mgctx->preloaded_dst_csets);
         else
             put_css_set(dst_cset);
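The cgroup_attach_lock()/cgroup_attach_unlock() pair introduced above must always be called with the same lock_mode, and cpus_read_lock() is taken unconditionally so ->attach() handlers may assume CPU hotplug is disabled. A hypothetical caller showing the required pairing; demo_migrate is illustrative, while cgroup_attach_task() is the helper declared in cgroup-internal.h above:

static int demo_migrate(struct cgroup *dst, struct task_struct *tsk,
            enum cgroup_attach_lock_mode mode)
{
    int ret;

    cgroup_attach_lock(mode, tsk);     /* also takes cpus_read_lock() */
    ret = cgroup_attach_task(dst, tsk, true);
    cgroup_attach_unlock(mode, tsk);   /* releases in reverse order */
    return ret;
}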
@@ -2754,19 +2994,17 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
     struct task_struct *task;
 
     /*
-     * Prevent freeing of tasks while we take a snapshot. Tasks that are
-     * already PF_EXITING could be freed from underneath us unless we
-     * take an rcu_read_lock.
+     * The following thread iteration should be inside an RCU critical
+     * section to prevent tasks from being freed while taking the snapshot.
+     * spin_lock_irq() implies RCU critical section here.
      */
     spin_lock_irq(&css_set_lock);
-    rcu_read_lock();
     task = leader;
     do {
         cgroup_migrate_add_task(task, mgctx);
         if (!threadgroup)
             break;
     } while_each_thread(leader, task);
-    rcu_read_unlock();
     spin_unlock_irq(&css_set_lock);
 
     return cgroup_migrate_execute(mgctx);
@@ -2789,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 
     /* look up all src csets */
     spin_lock_irq(&css_set_lock);
-    rcu_read_lock();
     task = leader;
     do {
         cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
         if (!threadgroup)
             break;
     } while_each_thread(leader, task);
-    rcu_read_unlock();
     spin_unlock_irq(&css_set_lock);
 
     /* prepare dst csets and commit */
@@ -2813,8 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 }
 
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-                         bool *locked)
-    __acquires(&cgroup_threadgroup_rwsem)
+                         enum cgroup_attach_lock_mode *lock_mode)
 {
     struct task_struct *tsk;
     pid_t pid;
@@ -2822,28 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
     if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
         return ERR_PTR(-EINVAL);
 
-    /*
-     * If we migrate a single thread, we don't care about threadgroup
-     * stability. If the thread is `current`, it won't exit(2) under our
-     * hands or change PID through exec(2). We exclude
-     * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
-     * callers by cgroup_mutex.
-     * Therefore, we can skip the global lock.
-     */
-    lockdep_assert_held(&cgroup_mutex);
-    if (pid || threadgroup) {
-        percpu_down_write(&cgroup_threadgroup_rwsem);
-        *locked = true;
-    } else {
-        *locked = false;
-    }
-
+retry_find_task:
     rcu_read_lock();
     if (pid) {
         tsk = find_task_by_vpid(pid);
         if (!tsk) {
             tsk = ERR_PTR(-ESRCH);
-            goto out_unlock_threadgroup;
+            goto out_unlock_rcu;
         }
     } else {
         tsk = current;
@@ -2860,36 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
      */
     if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
         tsk = ERR_PTR(-EINVAL);
-        goto out_unlock_threadgroup;
+        goto out_unlock_rcu;
     }
 
     get_task_struct(tsk);
-    goto out_unlock_rcu;
+    rcu_read_unlock();
 
-out_unlock_threadgroup:
-    if (*locked) {
-        percpu_up_write(&cgroup_threadgroup_rwsem);
-        *locked = false;
+    /*
+     * If we migrate a single thread, we don't care about threadgroup
+     * stability. If the thread is `current`, it won't exit(2) under our
+     * hands or change PID through exec(2). We exclude
+     * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+     * by cgroup_mutex. Therefore, we can skip the global lock.
+     */
+    lockdep_assert_held(&cgroup_mutex);
+
+    if (pid || threadgroup) {
+        if (cgroup_enable_per_threadgroup_rwsem)
+            *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+        else
+            *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+    } else {
+        *lock_mode = CGRP_ATTACH_LOCK_NONE;
+    }
+
+    cgroup_attach_lock(*lock_mode, tsk);
+
+    if (threadgroup) {
+        if (!thread_group_leader(tsk)) {
+            /*
+             * A race with de_thread from another thread's exec()
+             * may strip us of our leadership. If this happens,
+             * throw this task away and try again.
+ */ + cgroup_attach_unlock(*lock_mode, tsk); + put_task_struct(tsk); + goto retry_find_task; + } } + + return tsk; + out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool locked) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { - struct cgroup_subsys *ss; - int ssid; + cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - if (locked) - percpu_up_write(&cgroup_threadgroup_rwsem); - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -2941,29 +3183,54 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; + bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; + /* + * As cgroup_update_dfl_csses() is only called by + * cgroup_apply_control(). The csses associated with the + * given cgrp will not be affected by changes made to + * its subtree_control file. We can skip them. + */ + if (dsct == cgrp) + continue; + list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); + /* + * We need to write-lock threadgroup_rwsem while migrating tasks. + * However, if there are no source csets for @cgrp, changing its + * controllers isn't gonna produce any task migrations and the + * write-locking can be skipped safely. + */ + has_tasks = !list_empty(&mgctx.preloaded_src_csets); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode, NULL); + /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, + mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ @@ -2975,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(lock_mode, NULL); return ret; } @@ -2996,7 +3263,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) int ssid; restart: - mutex_lock(&cgroup_mutex); + cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { @@ -3010,7 +3277,7 @@ restart: prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); @@ -3212,11 +3479,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. 
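Both call sites now funnel their locking decision through one enum. Moving only the current thread needs no threadgroup lock at all, since cgroup_mutex already excludes the other writers; a real pid or threadgroup move takes either the global rwsem or, when cgroup_enable_per_threadgroup_rwsem is set, a per-threadgroup one; and cgroup_update_dfl_csses() skips locking entirely when no source csets were preloaded. The decision logic condensed into two hypothetical helpers (the enum values mirror the diff):

#include <stdbool.h>

enum cgroup_attach_lock_mode {
    CGRP_ATTACH_LOCK_NONE,
    CGRP_ATTACH_LOCK_GLOBAL,
    CGRP_ATTACH_LOCK_PER_THREADGROUP,
};

/* cgroup_procs_write_start(): migrating only `current` by itself is
 * already serialized by cgroup_mutex, so no rwsem is required. */
static enum cgroup_attach_lock_mode
attach_lock_mode(int pid, bool threadgroup, bool per_tg_rwsem)
{
    if (!pid && !threadgroup)
        return CGRP_ATTACH_LOCK_NONE;
    return per_tg_rwsem ? CGRP_ATTACH_LOCK_PER_THREADGROUP
                        : CGRP_ATTACH_LOCK_GLOBAL;
}

/* cgroup_update_dfl_csses(): no preloaded source csets means no task
 * can move, so the write lock can be skipped safely. */
static enum cgroup_attach_lock_mode
dfl_csses_lock_mode(bool has_tasks)
{
    return has_tasks ? CGRP_ATTACH_LOCK_GLOBAL : CGRP_ATTACH_LOCK_NONE;
}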
*/ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - return ret; - - return 0; + return cgroup_update_dfl_csses(cgrp); } /** @@ -3565,18 +3828,90 @@ static int cgroup_events_show(struct seq_file *seq, void *v) static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; + struct cgroup_subsys_state *css; + int dying_cnt[CGROUP_SUBSYS_COUNT]; + int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); + + /* + * Show the number of live and dying csses associated with each of + * non-inhibited cgroup subsystems that is bound to cgroup v2. + * + * Without proper lock protection, racing is possible. So the + * numbers may not be consistent when that happens. + */ + rcu_read_lock(); + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + dying_cnt[ssid] = -1; + if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || + (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) + continue; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; + seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, + css ? (css->nr_descendants + 1) : 0); + } + seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + if (dying_cnt[ssid] >= 0) + seq_printf(seq, "nr_dying_subsys_%s %d\n", + cgroup_subsys[ssid]->name, dying_cnt[ssid]); + } + rcu_read_unlock(); + return 0; +} + +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. */ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + do_div(freeze_time, NSEC_PER_USEC); + seq_printf(seq, "frozen_usec %llu\n", freeze_time); return 0; } -static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, - struct cgroup *cgrp, int ssid) +#ifdef CONFIG_CGROUP_SCHED +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css associated with @ss. If the css doesn't exist + * or is offline, %NULL is returned. 
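The frozen_usec read in cgroup_core_local_stat_show() above is lockless: the freezer bumps freeze_seq around every update, and the reader simply retries until the same even sequence brackets its loads. A userspace approximation of that retry shape with C11 atomics; this is a sketch only, as the kernel's seqcount_spinlock_t couples the counter to css_set_lock and carries stronger ordering guarantees than shown here:

#include <stdatomic.h>
#include <stdint.h>

struct freezer {
    atomic_uint seq;             /* odd while an update is in flight */
    _Atomic uint64_t frozen_nsec;
};

static void freezer_update(struct freezer *f, uint64_t ns)
{
    atomic_fetch_add_explicit(&f->seq, 1, memory_order_release);
    atomic_store_explicit(&f->frozen_nsec, ns, memory_order_relaxed);
    atomic_fetch_add_explicit(&f->seq, 1, memory_order_release);
}

/* Mirrors the read_seqcount_begin()/read_seqcount_retry() loop: wait
 * out writers, read, and retry if the sequence moved meanwhile. */
static uint64_t frozen_usec(struct freezer *f)
{
    unsigned int start;
    uint64_t ns;

    do {
        do {
            start = atomic_load_explicit(&f->seq, memory_order_acquire);
        } while (start & 1);
        ns = atomic_load_explicit(&f->frozen_nsec, memory_order_relaxed);
    } while (atomic_load_explicit(&f->seq, memory_order_acquire) != start);

    return ns / 1000;            /* the do_div(..., NSEC_PER_USEC) step */
}

The closing division is what reports the value in microseconds, matching the frozen_usec key emitted by cgroup.stat.local.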
+ */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (css && !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + +static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { + struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; @@ -3593,14 +3928,44 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, return ret; } +static int cgroup_local_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_local_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_local_stat_show(seq, css); + css_put(css); + return ret; +} +#endif + static int cpu_stat_show(struct seq_file *seq, void *v) { - struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED - ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); + ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); +#endif + return ret; +} + +static int cpu_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } @@ -3609,27 +3974,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } -static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, enum psi_res res) +static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; @@ -3643,15 +4008,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); - psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : &cgrp->psi; - new = psi_trigger_create(psi, buf, nbytes, res); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + + psi = cgroup_psi(cgrp); + new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } - psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3661,21 +4031,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_IO); + return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); + return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); + return pressure_write(of, buf, nbytes, PSI_CPU); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + return psi_show(seq, psi, PSI_IRQ); +} + +static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return pressure_write(of, buf, nbytes, PSI_IRQ); +} +#endif + +static int cgroup_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + seq_printf(seq, "%d\n", psi->enabled); + + return 0; +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + ssize_t ret; + int enable; + struct cgroup *cgrp; + struct psi_group *psi; + + ret = kstrtoint(strstrip(buf), 0, &enable); + if (ret) + return ret; + + if (enable < 0 || enable > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + psi = cgroup_psi(cgrp); + if (psi->enabled != enable) { + int i; + + /* show or hide {cpu,memory,io,irq}.pressure files */ + for (i = 0; i < NR_PSI_RESOURCES; i++) + cgroup_file_show(&cgrp->psi_files[i], enable); + + psi->enabled = enable; + if (enable) + psi_cgroup_restart(psi); + } + + cgroup_kn_unlock(of->kn); + + return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, @@ -3690,11 +4125,14 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; - psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) { + if (static_branch_likely(&psi_disabled)) + return false; + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } @@ -3748,7 +4186,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -3764,10 +4202,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void 
cgroup_kill(struct cgroup *cgrp) @@ -3848,13 +4282,14 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; @@ -3866,7 +4301,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - - * cgroup.procs and cgroup.subtree_control. + * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && @@ -3967,20 +4402,6 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* set uid and gid of cgroup dirs and files to that of the creator */ -static int cgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, @@ -3993,25 +4414,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; - int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_fsuid(), current_fsgid(), 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); - ret = cgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; @@ -4047,8 +4461,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) @@ -4113,21 +4525,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts) cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ - cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | + __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; + int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; + if (cft->flags & __CFTYPE_ADDED) { + ret = -EBUSY; + break; + } if (cft->seq_start) kf_ops = &cgroup_kf_ops; @@ -4141,30 +4557,29 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { - cgroup_exit_cftypes(cfts); - return -ENOMEM; + ret = -ENOMEM; + break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; + cft->flags |= __CFTYPE_ADDED; } - return 0; + if (ret) + cgroup_exit_cftypes(cfts); + return ret; } -static int cgroup_rm_cftypes_locked(struct cftype *cfts) +static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); - if (!cfts || !cfts[0].ss) - return -ENOENT; - list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); - return 0; } /** @@ -4180,12 +4595,16 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - int ret; + if (!cfts || cfts[0].name[0] == '\0') + return 0; - mutex_lock(&cgroup_mutex); - ret = cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_mutex); - return ret; + if (!(cfts[0].flags & __CFTYPE_ADDED)) + return -ENOENT; + + cgroup_lock(); + cgroup_rm_cftypes_locked(cfts); + cgroup_unlock(); + return 0; } /** @@ -4202,7 +4621,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. 
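__CFTYPE_ADDED makes cftype registration explicitly stateful: cgroup_init_cftypes() refuses an array that is already registered, and cgroup_rm_cftypes() can distinguish "never registered" (-ENOENT) from a legitimate removal without inspecting cft->ss. The flag protocol reduced to a miniature (hypothetical struct, real errno values):

#include <errno.h>

#define CFT_ADDED 0x1

struct cft { unsigned int flags; };

static int cft_register(struct cft *cft)
{
    if (cft->flags & CFT_ADDED)
        return -EBUSY;           /* double registration */
    cft->flags |= CFT_ADDED;
    return 0;
}

static int cft_unregister(struct cft *cft)
{
    if (!(cft->flags & CFT_ADDED))
        return -ENOENT;          /* was never registered */
    cft->flags &= ~CFT_ADDED;    /* done in cgroup_exit_cftypes() */
    return 0;
}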
*/ -static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -4216,14 +4635,14 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -4285,6 +4704,27 @@ void cgroup_file_notify(struct cgroup_file *cfile) } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_file_notify); + +/** + * cgroup_file_show - show or hide a hidden cgroup file + * @cfile: target cgroup_file obtained by setting cftype->file_offset + * @show: whether to show or hide + */ +void cgroup_file_show(struct cgroup_file *cfile, bool show) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + if (kn) + kernfs_show(kn, show); + + kernfs_put(kn); +} /** * css_next_child - find the next child of a given css @@ -4361,8 +4801,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @root are accessible and @pos is a descendant of @root. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -4410,8 +4851,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct rightmost descendant as - * long as @pos is accessible. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct rightmost descendant as long as @pos + * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) @@ -4455,9 +4897,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @cgroup are accessible and @pos is a descendant of - * @cgroup. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. 
* * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -4700,9 +5142,11 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { + unsigned long irqflags; + memset(it, 0, sizeof(*it)); - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); it->ss = css->ss; it->flags = flags; @@ -4716,7 +5160,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, css_task_iter_advance(it); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } /** @@ -4729,12 +5173,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) @@ -4747,7 +5193,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); return it->cur_task; } @@ -4760,11 +5206,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ void css_task_iter_end(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_cset) { - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } if (it->cur_dcset) @@ -4853,7 +5301,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) if (!inode) return -ENOMEM; - ret = inode_permission(&init_user_ns, inode, MAY_WRITE); + ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } @@ -4916,15 +5364,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; - const struct cred *saved_cred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4939,18 +5386,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. 
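The scoped_with_creds() conversion just below replaces the manual override_creds()/revert_creds() pair with a scope-bound guard, so no error path can forget the revert. A userspace sketch of the underlying mechanism using the GCC/Clang cleanup attribute; every name here is hypothetical, and the kernel builds its guards from cleanup.h helpers rather than this macro:

#include <stdio.h>

static int current_cred;

static int cred_override(int new_cred)
{
    int old = current_cred;

    current_cred = new_cred;
    return old;
}

static void cred_revert(int *old) { current_cred = *old; }

#define scoped_with_cred(new_cred)                                  \
    for (int __old __attribute__((cleanup(cred_revert))) =          \
             cred_override(new_cred), __once = 1;                   \
         __once; __once = 0)

int main(void)
{
    current_cred = 1000;                 /* the caller's credentials */
    scoped_with_cred(0)                  /* the file opener's creds  */
        printf("inside scope: %d\n", current_cred);
    printf("after scope: %d\n", current_cred);   /* back to 1000 */
    return 0;
}

Every way out of the scope runs the cleanup and restores the saved credentials, which is precisely the property the manual pair could not guarantee.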
*/ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - threadgroup, ctx->ns); - revert_creds(saved_cred); + scoped_with_creds(of->file->f_cred) + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -5032,6 +5478,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, + { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, @@ -5046,10 +5497,18 @@ static struct cftype cgroup_base_files[] = { .name = "cpu.stat", .seq_show = cpu_stat_show, }, + { + .name = "cpu.stat.local", + .seq_show = cpu_local_stat_show, + }, + { } /* terminate */ +}; + +static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5057,7 +5516,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5065,12 +5524,27 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif + { + .name = "cgroup.pressure", + .seq_show = cgroup_pressure_show, + .write = cgroup_pressure_write, + }, #endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -5091,7 +5565,7 @@ static struct cftype cgroup_base_files[] = { * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in - * css_free_work_fn(). + * css_free_rwork_fn(). 
* * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional @@ -5105,8 +5579,9 @@ static void css_free_rwork_fn(struct work_struct *work) struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); + css_rstat_exit(css); - if (ss) { + if (!css_is_self(css)) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; @@ -5120,8 +5595,10 @@ static void css_free_rwork_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + if (!cgroup_on_dfl(cgrp)) + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); + bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* @@ -5133,7 +5610,6 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -5153,28 +5629,41 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_mutex); + cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); - if (ss) { - /* css release path */ - if (!list_empty(&css->rstat_css_node)) { - cgroup_rstat_flush(cgrp); - list_del_rcu(&css->rstat_css_node); - } + if (!css_is_self(css)) { + struct cgroup *parent_cgrp; + + css_rstat_flush(css); cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); + + cgrp->nr_dying_subsys[ss->id]--; + /* + * When a css is released and ready to be freed, its + * nr_descendants must be zero. However, the corresponding + * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem + * is activated and deactivated multiple times with one or + * more of its previous activation leaving behind dying csses. 
+ */ + WARN_ON_ONCE(css->nr_descendants); + parent_cgrp = cgroup_parent(cgrp); + while (parent_cgrp) { + parent_cgrp->nr_dying_subsys[ss->id]--; + parent_cgrp = cgroup_parent(parent_cgrp); + } } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5194,10 +5683,10 @@ static void css_release_work_fn(struct work_struct *work) NULL); } - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5206,7 +5695,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5222,7 +5711,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -5231,9 +5719,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (ss->css_rstat_flush) - list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); - BUG_ON(cgroup_css(cgrp, ss)); } @@ -5252,8 +5737,11 @@ static int online_css(struct cgroup_subsys_state *css) rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); - if (css->parent) + if (css->parent) { atomic_inc(&css->parent->online_cnt); + while ((css = css->parent)) + css->nr_descendants++; + } } return ret; } @@ -5275,6 +5763,16 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** @@ -5313,6 +5811,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, goto err_free_css; css->id = err; + err = css_rstat_init(css); + if (err) + goto err_free_css; + /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -5326,16 +5828,14 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: - list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } /* * The returned cgroup is fully initialized including its control mask, but - * it isn't associated with its kernfs_node and doesn't have the control - * mask applied. + * it doesn't have the control mask applied. 
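The nr_dying_subsys[] counters above are charged and drained in matched walks up the hierarchy: offline_css() charges the css's own cgroup and every ancestor, while css_release_work_fn() drains exactly the same chain once the css is finally released. That is why a subsystem toggled on and off repeatedly can leave nonzero dying counts visible until the RCU-delayed releases catch up. Both walks flattened into a toy (struct cg with plain parent pointers is hypothetical):

#define NR_SS 4

struct cg {
    struct cg *parent;
    int nr_dying_subsys[NR_SS];
};

/* offline_css(): a dying css charges its cgroup and every ancestor. */
static void charge_dying(struct cg *cg, int ssid)
{
    for (struct cg *t = cg; t; t = t->parent)
        t->nr_dying_subsys[ssid]++;
}

/* css_release_work_fn(): the release walk drains the same chain, so
 * the counters net out to zero once all dying csses are freed. */
static void drain_dying(struct cg *cg, int ssid)
{
    for (struct cg *t = cg; t; t = t->parent)
        t->nr_dying_subsys[ssid]--;
}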
*/ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) @@ -5343,12 +5843,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; - int level = parent->level + 1; + int i, level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), - GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -5356,15 +5855,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + kn = kernfs_create_dir_ns(parent->kn, name, mode, + current_fsuid(), current_fsgid(), + cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_stat_exit; + goto out_cancel_ref; } cgrp->kn = kn; @@ -5374,19 +5871,27 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; - ret = psi_cgroup_alloc(cgrp); + /* + * Now that init_cgroup_housekeeping() has been called and cgrp->self + * is setup, it is safe to perform rstat initialization on it. + */ + ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove; - ret = cgroup_bpf_inherit(cgrp); + ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_psi_free; + goto out_stat_exit; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestors[tcgrp->level] = tcgrp; /* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5395,27 +5900,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } - spin_lock_irq(&css_set_lock); - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); - - if (tcgrp != cgrp) { - tcgrp->nr_descendants++; - - /* - * If the new cgroup is frozen, all ancestor cgroups - * get a new frozen descendant, but their state can't - * change because of this. - */ - if (cgrp->freezer.e_freeze) - tcgrp->freezer.nr_frozen_descendants++; - } - } - spin_unlock_irq(&css_set_lock); - if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5424,7 +5912,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->self.serial_nr = css_serial_nr_next++; + ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_ONLINE, + CGROUP_LIFETIME_OFFLINE, cgrp); + ret = notifier_to_errno(ret); + if (ret) + goto out_psi_free; + /* allocation complete, commit to creation */ + spin_lock_irq(&css_set_lock); + for (i = 0; i < level; i++) { + tcgrp = cgrp->ancestors[i]; + tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups get a new + * frozen descendant, but their state can't change because of + * this. 
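cgroup_create() fills the new cgrp->ancestors[] flexible array in a single walk, placing each ancestor at the index of its level with the cgroup itself in its own slot; because the parent's array is already complete, no recursion is needed, and descendant checks later collapse to two compares instead of a tree walk. A userspace sketch of the layout (struct cg is a hypothetical stand-in for struct cgroup):

#include <stdbool.h>
#include <stdlib.h>

struct cg {
    int level;
    struct cg *ancestors[];          /* [0..level], self included */
};

static struct cg *cg_parent(struct cg *cg)
{
    return cg->level ? cg->ancestors[cg->level - 1] : NULL;
}

static struct cg *cg_create(struct cg *parent)
{
    int level = parent ? parent->level + 1 : 0;
    struct cg *cg = calloc(1, sizeof(*cg) +
                              (level + 1) * sizeof(struct cg *));

    if (!cg)
        return NULL;
    cg->level = level;
    cg->ancestors[level] = cg;       /* own slot */
    for (struct cg *t = parent; t; t = cg_parent(t))
        cg->ancestors[t->level] = t; /* one slot per ancestor level */
    return cg;
}

/* The payoff: "is anc an ancestor-or-self of cg?" in O(1). */
static bool cg_is_descendant(struct cg *cg, struct cg *anc)
{
    return anc->level <= cg->level &&
           cg->ancestors[anc->level] == anc;
}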
+ */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } + spin_unlock_irq(&css_set_lock); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); @@ -5442,10 +5952,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_psi_free: psi_cgroup_free(cgrp); +out_stat_exit: + css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); -out_stat_exit: - cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5457,7 +5967,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; - int level = 1; + int level = 0; lockdep_assert_held(&cgroup_mutex); @@ -5465,7 +5975,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; - if (level > cgroup->max_depth) + if (level >= cgroup->max_depth) goto fail; level++; @@ -5501,15 +6011,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) } /* - * This extra ref will be put in cgroup_free_fn() and guarantees + * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(cgrp->kn); - if (ret) - goto out_destroy; - ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; @@ -5543,7 +6049,7 @@ static void css_killed_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - mutex_lock(&cgroup_mutex); + cgroup_lock(); do { offline_css(css); @@ -5552,7 +6058,7 @@ static void css_killed_work_fn(struct work_struct *work) css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ @@ -5563,7 +6069,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -5583,6 +6089,12 @@ static void kill_css(struct cgroup_subsys_state *css) if (css->flags & CSS_DYING) return; + /* + * Call css_killed(), if defined, before setting the CSS_DYING flag + */ + if (css->ss->css_killed) + css->ss->css_killed(css); + css->flags |= CSS_DYING; /* @@ -5640,7 +6152,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; - int ssid; + int ssid, ret; lockdep_assert_held(&cgroup_mutex); @@ -5662,7 +6174,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling - * cgroup_lock_live_group(). The latter makes the csets ignored by + * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. 
*/ cgrp->self.flags &= ~CSS_ONLINE; @@ -5680,11 +6192,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); - if (parent && cgroup_is_threaded(cgrp)) + if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* @@ -5698,7 +6210,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); - cgroup_bpf_offline(cgrp); + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_OFFLINE, cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -5736,7 +6250,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) pr_debug("Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_mutex); + cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); @@ -5760,6 +6274,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); + + BUG_ON(ss_rstat_init(ss)); + BUG_ON(css_rstat_init(css)); } /* Update the init_css_set to contain a subsys @@ -5780,7 +6297,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(online_css(css)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /** @@ -5808,6 +6325,8 @@ int __init cgroup_init_early(void) ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + WARN(ss->early_init && ss->css_rstat_flush, + "cgroup rstat cannot be used with early init subsystem\n"); ss->id = i; ss->name = cgroup_subsys_name[i]; @@ -5833,19 +6352,15 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_rstat_boot(); - - /* - * The latency of the synchronize_rcu() is too high for cgroups, - * avoid it at the cost of forcing all readers into the slow path. 
- */ - rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); + cgroup_rt_init(); - mutex_lock(&cgroup_mutex); + cgroup_lock(); /* * Add init_css_set to the hash table so that dfl_root can link to @@ -5854,9 +6369,11 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); + cgroup_bpf_lifetime_notifier_init(); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { @@ -5882,8 +6399,8 @@ int __init cgroup_init(void) continue; if (cgroup1_ssid_disabled(ssid)) - printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", - ss->name); + pr_info("Disabling %s control group subsystem in v1 mounts\n", + ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; @@ -5908,9 +6425,9 @@ int __init cgroup_init(void) if (ss->bind) ss->bind(init_css_set.subsys[ssid]); - mutex_lock(&cgroup_mutex); + cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ @@ -5922,10 +6439,11 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } @@ -5939,8 +6457,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); @@ -5957,18 +6481,24 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* - * cgroup_get_from_id : get the cgroup associated with cgroup id + * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id - * On success return the cgrp, on failure return NULL + * On success return the cgrp or ERR_PTR on failure + * There are no cgroup NS restrictions. */ -struct cgroup *cgroup_get_from_id(u64 id) +struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp = NULL; + struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) - goto out; + return ERR_PTR(-ENOENT); + + if (kernfs_type(kn) != KERNFS_DIR) { + kernfs_put(kn); + return ERR_PTR(-ENOENT); + } rcu_read_lock(); @@ -5977,9 +6507,33 @@ struct cgroup *cgroup_get_from_id(u64 id) cgrp = NULL; rcu_read_unlock(); - kernfs_put(kn); -out: + + if (!cgrp) + return ERR_PTR(-ENOENT); + return cgrp; +} + +/* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. 
+ */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct cgroup *cgrp, *root_cgrp; + + cgrp = __cgroup_get_from_id(id); + if (IS_ERR(cgrp)) + return cgrp; + + root_cgrp = current_cgns_cgroup_dfl(); + if (!cgroup_is_descendant(cgrp, root_cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-ENOENT); + } + return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); @@ -6001,7 +6555,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - mutex_lock(&cgroup_mutex); + rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { @@ -6009,7 +6563,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_visible) + if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) + continue; + + cgrp = task_cgroup_from_root(tsk, root); + /* The root has already been unmounted. */ + if (!cgrp) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -6022,9 +6581,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); - - cgrp = task_cgroup_from_root(tsk, root); - /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, @@ -6037,7 +6593,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; @@ -6056,7 +6612,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + rcu_read_unlock(); kfree(buf); out: return retval; @@ -6075,16 +6631,37 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -static struct cgroup *cgroup_get_from_file(struct file *f) +/** + * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer + * @f: file corresponding to cgroup_dir + * + * Find the cgroup from a file pointer associated with a cgroup directory. + * Returns a pointer to the cgroup on success. ERR_PTR is returned if the + * cgroup cannot be found. + */ +static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; - struct cgroup *cgrp; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); - cgrp = css->cgroup; + return css->cgroup; +} + +/** + * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports + * cgroup2. 
+ * @f: file corresponding to cgroup2_dir + */ +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); @@ -6116,16 +6693,19 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; - struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) - mutex_lock(&cgroup_mutex); + cgroup_lock(); cgroup_threadgroup_change_begin(current); spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6133,14 +6713,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) return 0; } - f = fget_raw(kargs->cgroup); - if (!f) { + CLASS(fd_raw, f)(kargs->cgroup); + if (fd_empty(f)) { ret = -EBADF; goto err; } - sb = f->f_path.dentry->d_sb; + sb = fd_file(f)->f_path.dentry->d_sb; - dst_cgrp = cgroup_get_from_file(f); + dst_cgrp = cgroup_get_from_file(fd_file(f)); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; @@ -6161,6 +6741,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) if (ret) goto err; + /* + * Spawning a task directly into a cgroup works by passing a file + * descriptor to the target cgroup directory. This can even be an O_PATH + * file descriptor. But it can never be a cgroup.procs file descriptor. + * This was done on purpose so spawning into a cgroup could be + * conceptualized as an atomic + * + * fd = openat(dfd_cgroup, "cgroup.procs", ...); + * write(fd, <child-pid>, ...); + * + * sequence, i.e. it's a shorthand for the caller opening and writing + * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us + * to always use the caller's credentials. + */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); @@ -6174,15 +6768,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) } put_css_set(cset); - fput(f); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); - mutex_unlock(&cgroup_mutex); - if (f) - fput(f); + cgroup_unlock(); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); @@ -6201,19 +6792,18 @@ err: static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { - cgroup_threadgroup_change_end(current); + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; - if (kargs->flags & CLONE_INTO_CGROUP) { - struct cgroup *cgrp = kargs->cgrp; - struct css_set *cset = kargs->cset; - - mutex_unlock(&cgroup_mutex); + cgroup_threadgroup_change_end(current); - if (cset) { - put_css_set(cset); - kargs->cset = NULL; - } + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + if (kargs->flags & CLONE_INTO_CGROUP) { + cgroup_unlock(); if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; @@ -6224,6 +6814,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process + * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). 
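The kill_seq snapshot taken here closes the fork-versus-kill race without the old CGRP_KILL set-and-clear dance: __cgroup_kill() only ever increments the counter, cgroup_css_set_fork() snapshots it before the child becomes visible, and cgroup_post_fork() further below kills the child if the value moved in between. A counter, unlike a flag, cannot be wiped out by one kill completing while a second is still in flight. The whole scheme in miniature:

#include <stdio.h>

struct cg { unsigned int kill_seq; };

static void cg_kill(struct cg *cg)       /* __cgroup_kill() analogue */
{
    cg->kill_seq++;                      /* bumped, never cleared */
}

int main(void)
{
    struct cg cg = { 0 };
    unsigned int snap = cg.kill_seq;     /* cgroup_css_set_fork() */

    cg_kill(&cg);                        /* racing cgroup.kill writer */

    /* cgroup_post_fork(): any bump since the snapshot kills the child */
    printf("kill new child: %s\n",
           cg.kill_seq != snap ? "yes" : "no");
    return 0;
}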
@@ -6286,6 +6877,7 @@ void cgroup_cancel_fork(struct task_struct *child, /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process + * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. @@ -6294,6 +6886,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6307,10 +6900,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6345,7 +6941,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. */ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); @@ -6376,40 +6972,101 @@ void cgroup_post_fork(struct task_struct *child, } /** - * cgroup_exit - detach cgroup from exiting task + * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ -void cgroup_exit(struct task_struct *tsk) +void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; - struct css_set *cset; int i; - spin_lock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ + do_each_subsys_mask(ss, i, have_exit_callback) { + ss->exit(tsk); + } while_each_subsys_mask(); +} + +static void do_cgroup_task_dead(struct task_struct *tsk) +{ + struct css_set *cset; + unsigned long flags; + + spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + /* matches the signal->live check in css_task_iter_advance() */ + if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + + if (dl_task(tsk)) + dec_dl_tasks_cs(tsk); WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); +} - /* see cgroup_post_fork() for details */ - do_each_subsys_mask(ss, i, have_exit_callback) { - ss->exit(tsk); - } while_each_subsys_mask(); +#ifdef CONFIG_PREEMPT_RT +/* + * cgroup_task_dead() is called from finish_task_switch() which doesn't allow + * scheduling even in RT. As the task_dead path requires grabbing css_set_lock, + * this lead to sleeping in the invalid context warning bug. css_set_lock is too + * big to become a raw_spinlock. The task_dead path doesn't need to run + * synchronously but can't be delayed indefinitely either as the dead task pins + * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy + * irq_work to allow batching while ensuring timely completion. 
+ */ +static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks); +static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork); + +static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork) +{ + struct llist_node *lnode; + struct task_struct *task, *next; + + lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks)); + llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) { + do_cgroup_task_dead(task); + put_task_struct(task); + } } -void cgroup_release(struct task_struct *task) +static void __init cgroup_rt_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu)); + per_cpu(cgrp_dead_tasks_iwork, cpu) = + IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn); + } +} + +void cgroup_task_dead(struct task_struct *task) +{ + get_task_struct(task); + llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks)); + irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork)); +} +#else /* CONFIG_PREEMPT_RT */ +static void __init cgroup_rt_init(void) {} + +void cgroup_task_dead(struct task_struct *task) +{ + do_cgroup_task_dead(task); +} +#endif /* CONFIG_PREEMPT_RT */ + +void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; @@ -6417,16 +7074,19 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); - - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); } -void cgroup_free(struct task_struct *task) +void cgroup_task_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); + + if (!list_empty(&task->cg_list)) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } + put_css_set(cset); } @@ -6473,6 +7133,12 @@ static int __init enable_cgroup_debug(char *str) } __setup("cgroup_debug", enable_cgroup_debug); +static int __init cgroup_favordynmods_setup(char *str) +{ + return (kstrtobool(str, &have_favordynmods) == 0); +} +__setup("cgroup_favordynmods=", cgroup_favordynmods_setup); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6540,8 +7206,10 @@ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); + struct cgroup *root_cgrp; - kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + root_cgrp = current_cgns_cgroup_dfl(); + kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; @@ -6566,25 +7234,39 @@ out: EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** - * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) + * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ +struct cgroup *cgroup_v1v2_get_from_fd(int fd) +{ + CLASS(fd_raw, f)(fd); + if (fd_empty(f)) + return ERR_PTR(-EBADF); + + return cgroup_v1v2_get_from_file(fd_file(f)); +} + +/** + * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports + * cgroup2. 
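The PREEMPT_RT path above relies on llist's lock-free contract: producers push with a compare-and-swap from any context, including finish_task_switch(), and the irq_work consumer detaches the entire batch with a single atomic exchange via llist_del_all(). A portable C11 sketch of that push/pop-all pair; this illustrates the idea and is not the kernel's llist.h:

#include <stdatomic.h>
#include <stddef.h>

struct lnode { struct lnode *next; };

struct lhead { _Atomic(struct lnode *) first; };

/* llist_add() analogue: lock-free push, safe from any context. */
static void lpush(struct lhead *h, struct lnode *n)
{
    struct lnode *old = atomic_load_explicit(&h->first,
                                             memory_order_relaxed);
    do {
        n->next = old;
    } while (!atomic_compare_exchange_weak_explicit(&h->first, &old, n,
                                                    memory_order_release,
                                                    memory_order_relaxed));
}

/* llist_del_all() analogue: one exchange detaches the whole batch,
 * which the irq_work handler then walks at its leisure. */
static struct lnode *lpop_all(struct lhead *h)
{
    return atomic_exchange_explicit(&h->first, NULL,
                                    memory_order_acquire);
}

Because the consumer always takes the whole list, the classic ABA hazard of lock-free stacks never arises here.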
+ * @fd: fd obtained by open(cgroup2_dir) + */ struct cgroup *cgroup_get_from_fd(int fd) { - struct cgroup *cgrp; - struct file *f; + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); - f = fget_raw(fd); - if (!f) - return ERR_PTR(-EBADF); + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); - cgrp = cgroup_get_from_file(f); - fput(f); + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); @@ -6699,9 +7381,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf, if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; - if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); @@ -6721,8 +7400,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, int ssid; ssize_t ret = 0; - ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, - NULL); + ret = show_delegatable_files(cgroup_base_files, buf + ret, + PAGE_SIZE - ret, NULL); + if (cgroup_psi_enabled()) + ret += show_delegatable_files(cgroup_psi_files, buf + ret, + PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, @@ -6738,8 +7420,11 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "favordynmods\n" "memory_localevents\n" - "memory_recursiveprot\n"); + "memory_recursiveprot\n" + "memory_hugetlb_accounting\n" + "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h new file mode 100644 index 000000000000..01976c8e7d49 --- /dev/null +++ b/kernel/cgroup/cpuset-internal.h @@ -0,0 +1,308 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __CPUSET_INTERNAL_H +#define __CPUSET_INTERNAL_H + +#include <linux/cgroup.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/spinlock.h> +#include <linux/union_find.h> + +/* See "Frequency meter" comments, below. 
 */
+
+struct fmeter {
+	int cnt;		/* unprocessed events count */
+	int val;		/* most recent output value */
+	time64_t time;		/* clock (secs) when val computed */
+	spinlock_t lock;	/* guards read or write of above */
+};
+
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+	PERR_NONE = 0,
+	PERR_INVCPUS,
+	PERR_INVPARENT,
+	PERR_NOTPART,
+	PERR_NOTEXCL,
+	PERR_NOCPUS,
+	PERR_HOTPLUG,
+	PERR_CPUSEMPTY,
+	PERR_HKEEPING,
+	PERR_ACCESS,
+	PERR_REMOTE,
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+	CS_CPU_EXCLUSIVE,
+	CS_MEM_EXCLUSIVE,
+	CS_MEM_HARDWALL,
+	CS_MEMORY_MIGRATE,
+	CS_SCHED_LOAD_BALANCE,
+	CS_SPREAD_PAGE,
+	CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+	FILE_MEMORY_MIGRATE,
+	FILE_CPULIST,
+	FILE_MEMLIST,
+	FILE_EFFECTIVE_CPULIST,
+	FILE_EFFECTIVE_MEMLIST,
+	FILE_SUBPARTS_CPULIST,
+	FILE_EXCLUSIVE_CPULIST,
+	FILE_EFFECTIVE_XCPULIST,
+	FILE_ISOLATED_CPULIST,
+	FILE_CPU_EXCLUSIVE,
+	FILE_MEM_EXCLUSIVE,
+	FILE_MEM_HARDWALL,
+	FILE_SCHED_LOAD_BALANCE,
+	FILE_PARTITION_ROOT,
+	FILE_SCHED_RELAX_DOMAIN_LEVEL,
+	FILE_MEMORY_PRESSURE_ENABLED,
+	FILE_MEMORY_PRESSURE,
+	FILE_SPREAD_PAGE,
+	FILE_SPREAD_SLAB,
+} cpuset_filetype_t;
+
+struct cpuset {
+	struct cgroup_subsys_state css;
+
+	unsigned long flags;		/* "unsigned long" so bitops work */
+
+	/*
+	 * On default hierarchy:
+	 *
+	 * The user-configured masks can only be changed by writing to
+	 * cpuset.cpus and cpuset.mems, and won't be limited by the
+	 * parent masks.
+	 *
+	 * The effective masks are the real masks that apply to the tasks
+	 * in the cpuset. They may be changed if the configured masks are
+	 * changed or hotplug happens.
+	 *
+	 * effective_mask == configured_mask & parent's effective_mask,
+	 * and if it ends up empty, it will inherit the parent's mask.
+	 * A short sketch below illustrates this rule.
+	 *
+	 *
+	 * On legacy hierarchy:
+	 *
+	 * The user-configured masks are always the same as the effective
+	 * masks.
+	 */
+
+	/* user-configured CPUs and Memory Nodes allowed to tasks */
+	cpumask_var_t cpus_allowed;
+	nodemask_t mems_allowed;
+
+	/* effective CPUs and Memory Nodes allowed to tasks */
+	cpumask_var_t effective_cpus;
+	nodemask_t effective_mems;
+
+	/*
+	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
+	 *
+	 * The effective_cpus of a valid partition root come solely from its
+	 * effective_xcpus, and some of the effective_xcpus may be distributed
+	 * to sub-partitions below & hence excluded from its effective_cpus.
+	 * For a valid partition root, its effective_cpus is related to
+	 * cpus_allowed only when its exclusive_cpus is not set.
+	 *
+	 * This value will only be set if either exclusive_cpus is set or
+	 * when this cpuset becomes a local partition root.
+	 */
+	cpumask_var_t effective_xcpus;
+
+	/*
+	 * Exclusive CPUs as requested by the user (default hierarchy only)
+	 *
+	 * Its value is independent of cpus_allowed and designates the set of
+	 * CPUs that can be granted to the current cpuset or its children when
+	 * it becomes a valid partition root. The effective set of exclusive
+	 * CPUs granted (effective_xcpus) depends on whether those exclusive
+	 * CPUs are passed down by its ancestors and not yet taken up by
+	 * another sibling partition root along the way.
+	 *
+	 * If its value isn't set, it defaults to cpus_allowed.
+	 */
+	cpumask_var_t exclusive_cpus;
+
+	/*
+	 * This is the old set of Memory Nodes that tasks took on.
+	 *
+	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some + * task is moved into it. + * - old_mems_allowed is used in cpuset_migrate_mm() when we change + * cpuset.mems_allowed and have tasks' nodemask updated, and + * then old_mems_allowed is updated to mems_allowed. + */ + nodemask_t old_mems_allowed; + + struct fmeter fmeter; /* memory_pressure filter */ + + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + + /* for custom sched domain */ + int relax_domain_level; + + /* partition root state */ + int partition_root_state; + + /* + * Whether cpuset is a remote partition. + * It used to be a list anchoring all remote partitions — we can switch back + * to a list if we need to iterate over the remote partitions. + */ + bool remote_partition; + + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; + + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; + + /* Used to merge intersecting subsets for generate_sched_domains */ + struct uf_node node; +}; + +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuset, css) : NULL; +} + +/* Retrieve the cpuset for a task */ +static inline struct cpuset *task_cs(struct task_struct *task) +{ + return css_cs(task_css(task, cpuset_cgrp_id)); +} + +static inline struct cpuset *parent_cs(struct cpuset *cs) +{ + return css_cs(cs->css.parent); +} + +/* convenient tests for these bits */ +static inline bool is_cpuset_online(struct cpuset *cs) +{ + return css_is_online(&cs->css) && !css_is_dying(&cs->css); +} + +static inline int is_cpu_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + +static inline int is_sched_load_balance(const struct cpuset *cs) +{ + return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); +} + +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); +} + +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_css: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. 
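The effective-mask rule documented in the struct cpuset comment above boils down to one intersection with a fallback. A minimal sketch using the helpers just defined (compute_effective_cpus() is an illustrative name, not a helper added by this patch; it assumes @cs is not top_cpuset):

/*
 * effective = user-configured & parent's effective, falling back to the
 * parent's mask when the intersection ends up empty.
 */
static void compute_effective_cpus(struct cpuset *cs, struct cpumask *out)
{
	struct cpuset *parent = parent_cs(cs);

	/* cpumask_and() returns false when the result is empty */
	if (!cpumask_and(out, cs->cpus_allowed, parent->effective_cpus))
		cpumask_copy(out, parent->effective_cpus);
}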
+ * Must be used with RCU read locked. The caller may modify @pos_css by
+ * calling css_rightmost_descendant() to skip a subtree. @root_cs is
+ * included in the iteration and is the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
+	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
+		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
+void rebuild_sched_domains_locked(void);
+void cpuset_callback_lock_irq(void);
+void cpuset_callback_unlock_irq(void);
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
+void cpuset_update_tasks_nodemask(struct cpuset *cs);
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
+ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off);
+int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
+
+/*
+ * cpuset-v1.c
+ */
+#ifdef CONFIG_CPUSETS_V1
+extern struct cftype cpuset1_files[];
+void fmeter_init(struct fmeter *fmp);
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
+					struct task_struct *tsk);
+void cpuset1_update_tasks_flags(struct cpuset *cs);
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+			struct cpumask *new_cpus, nodemask_t *new_mems,
+			bool cpus_updated, bool mems_updated);
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+#else
+static inline void fmeter_init(struct fmeter *fmp) {}
+static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
+					struct task_struct *tsk) {}
+static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
+static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+			struct cpumask *new_cpus, nodemask_t *new_mems,
+			bool cpus_updated, bool mems_updated) {}
+static inline int cpuset1_validate_change(struct cpuset *cur,
+			struct cpuset *trial) { return 0; }
+#endif /* CONFIG_CPUSETS_V1 */
+
+#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
new file mode 100644
index 000000000000..12e76774c75b
--- /dev/null
+++ b/kernel/cgroup/cpuset-v1.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "cgroup-internal.h"
+#include "cpuset-internal.h"
+
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
+ */
+struct cpuset_remove_tasks_struct {
+	struct work_struct work;
+	struct cpuset *cs;
+};
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter.  There are four routines:
+ *    fmeter_init() - initialize a frequency meter.
+ *    fmeter_markevent() - called each time the event happens.
+ *    fmeter_getrate() - returns the recent rate of such events.
+ *    fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR).  The time unit
+ * is 1 second.  Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. + */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled __read_mostly; + +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". 
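The half-life claim above is easy to verify: FM_COEF/FM_SCALE = 0.933 and 0.933^10 ≈ 0.50, so ten event-free ticks cut the value roughly in half. A standalone user-space check with the same constants and the same integer arithmetic as fmeter_update():

/* Standalone check of the filter's 10-second half-life. */
#include <stdio.h>

#define FM_COEF  933
#define FM_SCALE 1000

int main(void)
{
	int val = 1000000, t;

	for (t = 1; t <= 10; t++)
		val = (FM_COEF * val) / FM_SCALE;
	printf("after 10 ticks: %d\n", val);	/* prints 499820, ~half */
	return 0;
}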
Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + */ + +void __cpuset_memory_pressure_bump(void) +{ + rcu_read_lock(); + fmeter_markevent(&task_cs(current)->fmeter); + rcu_read_unlock(); +} + +static int update_relax_domain_level(struct cpuset *cs, s64 val) +{ +#ifdef CONFIG_SMP + if (val < -1 || val > sched_domain_level_max + 1) + return -EINVAL; +#endif + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) + rebuild_sched_domains_locked(); + } + + return 0; +} + +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = -ENODEV; + + cpuset_full_lock(); + if (!is_cpuset_online(cs)) + goto out_unlock; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_full_unlock(); + return retval; +} + +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +/* + * update task's spread flag if cpuset's page/slab spread flag is set + * + * Call with callback_lock or cpuset_mutex held. The check can be skipped + * if on default hierarchy. + */ +void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) +{ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + if (is_spread_page(cs)) + task_set_spread_page(tsk); + else + task_clear_spread_page(tsk); + + if (is_spread_slab(cs)) + task_set_spread_slab(tsk); + else + task_clear_spread_slab(tsk); +} + +/** + * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset. + * @cs: the cpuset in which each task's spread flags needs to be changed + * + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. + */ +void cpuset1_update_tasks_flags(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + cpuset1_update_task_spread_flags(cs, task); + css_task_iter_end(&it); +} + +/* + * If CPU and/or memory hotplug handlers, below, unplug any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). 
+ */ + parent = parent_cs(cs); + while (cpumask_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent_cs(parent); + + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); + } +} + +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + +void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + cpuset_callback_lock_irq(); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + cpuset_callback_unlock_irq(); + + /* + * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migrated to an ancestor. + */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + cpuset_update_tasks_cpumask(cs, new_cpus); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + cpuset_update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Execute it asynchronously using workqueue. + */ + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); + } +} + +/* + * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags + * are only set if the other's are set. Call holding cpuset_mutex. + */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/* + * cpuset1_validate_change() - Validate conditions specific to legacy (v1) + * behavior. + */ +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc/<pid>/cpuset. 
+ */ +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + struct cgroup_subsys_state *css; + int retval; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + + if (retval == -E2BIG) + retval = -ENAMETOOLONG; + if (retval < 0) + goto out_free; + seq_puts(m, buf); + seq_putc(m, '\n'); + retval = 0; +out_free: + kfree(buf); +out: + return retval; +} +#endif /* CONFIG_PROC_PID_CPUSET */ + +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = 0; + + cpuset_full_lock(); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; + } + + switch (type) { + case FILE_CPU_EXCLUSIVE: + retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); + break; + case FILE_MEM_EXCLUSIVE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); + retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); + retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); + break; + case FILE_SCHED_LOAD_BALANCE: + pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); + retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + break; + case FILE_MEMORY_MIGRATE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); + retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); + cpuset_memory_pressure_enabled = !!val; + break; + case FILE_SPREAD_PAGE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); + retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); + break; + case FILE_SPREAD_SLAB: + pr_warn_once("cpuset.%s is deprecated\n", cft->name); + retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_full_unlock(); + return retval; +} + +/* + * for the common functions, 'private' gives the type of file + */ + +struct cftype cpuset1_files[] = { + { + .name = "cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .seq_show = cpuset_common_seq_show, + .write = 
cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + }, + + { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + /* obsolete, may be removed in the future */ + .name = "memory_spread_slab", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, + + { + .name = "memory_pressure_enabled", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE_ENABLED, + }, + + { } /* terminate */ +}; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dc653ab26e50..6e6eb09b8db6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,174 +21,120 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. 
*/ +#include "cpuset-internal.h" -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/cpuset.h> -#include <linux/err.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/list.h> #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> #include <linux/export.h> -#include <linux/mount.h> -#include <linux/fs_context.h> -#include <linux/namei.h> -#include <linux/pagemap.h> -#include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> -#include <linux/seq_file.h> #include <linux/security.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/time.h> -#include <linux/time64.h> -#include <linux/backing-dev.h> -#include <linux/sort.h> #include <linux/oom.h> #include <linux/sched/isolation.h> -#include <linux/uaccess.h> -#include <linux/atomic.h> -#include <linux/mutex.h> -#include <linux/cgroup.h> #include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* * There could be abnormal cpuset configurations for cpu or memory - * node binding, add this key to provide a quick low-cost judgement + * node binding, add this key to provide a quick low-cost judgment * of the situation. */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ +static const char * const perr_strings[] = { + [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", + [PERR_INVPARENT] = "Parent is an invalid partition root", + [PERR_NOTPART] = "Parent is not a partition root", + [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", + [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", + [PERR_HOTPLUG] = "No cpu available due to hotplug", + [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", + [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", + [PERR_ACCESS] = "Enable partition not permitted", + [PERR_REMOTE] = "Have remote partition underneath", }; -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierarchy: - * - * The user-configured masks are always the same with effective masks. 
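The perr_strings[] table above pairs with enum prs_errcode so the last partition error can be echoed back to user space. The show-side pairing is roughly the following (show_partition_err() is an illustrative name; the real read handler lives elsewhere in this file):

/* Sketch: surface the last recorded partition error to user space. */
static void show_partition_err(struct seq_file *seq, struct cpuset *cs)
{
	enum prs_errcode err = READ_ONCE(cs->prs_err);

	if (err != PERR_NONE)
		seq_printf(seq, "%s\n", perr_strings[err]);
}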
-	 */
-
-	/* user-configured CPUs and Memory Nodes allow to tasks */
-	cpumask_var_t cpus_allowed;
-	nodemask_t mems_allowed;
-
-	/* effective CPUs and Memory Nodes allow to tasks */
-	cpumask_var_t effective_cpus;
-	nodemask_t effective_mems;
-
-	/*
-	 * CPUs allocated to child sub-partitions (default hierarchy only)
-	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
-	 * - effective_cpus and subparts_cpus are mutually exclusive.
-	 *
-	 * effective_cpus contains only onlined CPUs, but subparts_cpus
-	 * may have offlined ones.
-	 */
-	cpumask_var_t subparts_cpus;
-
-	/*
-	 * This is old Memory Nodes tasks took on.
-	 *
-	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
-	 * - A new cpuset's old_mems_allowed is initialized when some
-	 *   task is moved into it.
-	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
-	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
-	 *   then old_mems_allowed is updated to mems_allowed.
-	 */
-	nodemask_t old_mems_allowed;
-
-	struct fmeter fmeter;		/* memory_pressure filter */
-
-	/*
-	 * Tasks are being attached to this cpuset.  Used to prevent
-	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
-	 */
-	int attach_in_progress;
-
-	/* partition number for rebuild_sched_domains() */
-	int pn;
-
-	/* for custom sched domain */
-	int relax_domain_level;
+/*
+ * For local partitions, update to subpartitions_cpus & isolated_cpus is done
+ * in update_parent_effective_cpumask(). For remote partitions, it is done in
+ * the remote_partition_*() and remote_cpus_update() helpers.
+ */
+/*
+ * Exclusive CPUs distributed out to local or remote sub-partitions of
+ * top_cpuset
+ */
+static cpumask_var_t	subpartitions_cpus;

-	/* number of CPUs in subparts_cpus */
-	int nr_subparts_cpus;
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t	isolated_cpus;

-	/* partition root state */
-	int partition_root_state;
+/*
+ * isolated_cpus updating flag (protected by cpuset_mutex)
+ * Set if isolated_cpus is going to be updated in the current
+ * cpuset_mutex critical section.
+ */
+static bool isolated_cpus_updating;

-	/*
-	 * Default hierarchy only:
-	 * use_parent_ecpus - set if using parent's effective_cpus
-	 * child_ecpus_count - # of children with use_parent_ecpus set
-	 */
-	int use_parent_ecpus;
-	int child_ecpus_count;
+/*
+ * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
+ */
+static cpumask_var_t	boot_hk_cpus;
+static bool		have_boot_isolcpus;

-	/* Handle for cpuset.cpus.partition */
-	struct cgroup_file partition_file;
-};
+/*
+ * A flag to force sched domain rebuild at the end of an operation.
+ * It can be set in
+ *  - update_partition_sd_lb()
+ *  - update_cpumasks_hier()
+ *  - cpuset_update_flag()
+ *  - cpuset_hotplug_update_tasks()
+ *  - cpuset_handle_hotplug()
+ *
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ *
+ * Note that update_relax_domain_level() in cpuset-v1.c can still call
+ * rebuild_sched_domains_locked() directly without using this flag.
+ */
+static bool force_sd_rebuild;

 /*
  * Partition root states:
  *
- *   0 - not a partition root
- *
+ *   0 - member (not a partition root)
  *   1 - partition root
- *
+ *   2 - partition root without load balancing (isolated)
  *  -1 - invalid partition root
- *       None of the cpus in cpus_allowed can be put into the parent's
- *       subparts_cpus. In this case, the cpuset is not a real partition
- *       root anymore.
However, the CPU_EXCLUSIVE bit will still be set - * and the cpuset can be restored back to a partition root if the - * parent cpuset can give more CPUs back to this child cpuset. + * -2 - invalid isolated partition root + * + * There are 2 types of partitions - local or remote. Local partitions are + * those whose parents are partition root themselves. Setting of + * cpuset.cpus.exclusive are optional in setting up local partitions. + * Remote partitions are those whose parents are not partition roots. Passing + * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor + * nodes are mandatory in creating a remote partition. + * + * For simplicity, a local partition can be created under a local or remote + * partition but a remote partition cannot have any partition root in its + * ancestor chain except the cgroup root. */ -#define PRS_DISABLED 0 -#define PRS_ENABLED 1 -#define PRS_ERROR -1 +#define PRS_MEMBER 0 +#define PRS_ROOT 1 +#define PRS_ISOLATED 2 +#define PRS_INVALID_ROOT -1 +#define PRS_INVALID_ISOLATED -2 /* * Temporary cpumasks for working with partitions that are passed among @@ -199,141 +145,96 @@ struct tmpmasks { cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline bool is_cpuset_online(struct cpuset *cs) +void inc_dl_tasks_cs(struct task_struct *p) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); -} + struct cpuset *cs = task_cs(p); -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); + cs->nr_deadline_tasks++; } -static inline int is_mem_exclusive(const struct cpuset *cs) +void dec_dl_tasks_cs(struct task_struct *p) { - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} + struct cpuset *cs = task_cs(p); -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); + cs->nr_deadline_tasks--; } -static inline int is_sched_load_balance(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); + return cs->partition_root_state > 0; } -static inline int is_spread_page(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { - return test_bit(CS_SPREAD_PAGE, &cs->flags); + return cs->partition_root_state < 0; } -static inline int is_spread_slab(const struct cpuset *cs) +static inline bool cs_is_member(const struct cpuset *cs) { - return test_bit(CS_SPREAD_SLAB, &cs->flags); + return cs->partition_root_state == PRS_MEMBER; } -static inline int is_partition_root(const struct cpuset *cs) +/* + * Callers should hold callback_lock to modify partition_root_state. 
+ */ +static inline void make_partition_invalid(struct cpuset *cs) { - return cs->partition_root_state > 0; + if (cs->partition_root_state > 0) + cs->partition_root_state = -cs->partition_root_state; } /* * Send notification event of whenever partition_root_state changes. */ -static inline void notify_partition_change(struct cpuset *cs, - int old_prs, int new_prs) +static inline void notify_partition_change(struct cpuset *cs, int old_prs) { - if (old_prs != new_prs) - cgroup_file_notify(&cs->partition_file); -} - -static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), - .partition_root_state = PRS_ENABLED, -}; + if (old_prs == cs->partition_root_state) + return; + cgroup_file_notify(&cs->partition_file); -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through the online children of @parent_cs. Must be used - * with RCU read locked. - */ -#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ - css_for_each_child((pos_css), &(parent_cs)->css) \ - if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) + /* Reset prs_err if not invalid */ + if (is_partition_valid(cs)) + WRITE_ONCE(cs->prs_err, PERR_NONE); +} -/** - * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants - * @des_cs: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @root_cs: target cpuset to walk ancestor of +/* + * The top_cpuset is always synchronized to cpu_active_mask and we should avoid + * using cpu_online_mask as much as possible. An active CPU is always an online + * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ + * during hotplug operations. A CPU is marked active at the last stage of CPU + * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code + * will be called to update the sched domains so that the scheduler can move + * a normal task to a newly active CPU or remove tasks away from a newly + * inactivated CPU. The online bit is set much earlier in the CPU bringup + * process and cleared much later in CPU teardown. * - * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. @root_cs is included in the - * iteration and the first node to be visited. + * If cpu_online_mask is used while a hotunplug operation is happening in + * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ -#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ - css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ - if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) +static struct cpuset top_cpuset = { + .flags = BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), + .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, + .remote_partition = false, +}; /* - * There are two global locks guarding cpuset structures - cpuset_rwsem and - * callback_lock. We also require taking task_lock() when dereferencing a - * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. The cpuset code uses only cpuset_rwsem write lock. Other - * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to - * prevent change to cpuset structures. 
+ * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel + * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset + * structures. Note that cpuset_mutex needs to be a mutex as it is used in + * paths that rely on priority inheritance (e.g. scheduler - on RT) for + * correctness. * * A task must hold both locks to modify cpusets. If a task holds - * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it - * is the only task able to also acquire callback_lock and be able to - * modify cpusets. It can perform various checks on the cpuset structure - * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_rwsem. While it is performing these checks, various - * callback routines can briefly acquire callback_lock to query cpusets. - * Once it is ready to make the changes, it takes callback_lock, blocking - * everyone else. + * cpuset_mutex, it blocks others, ensuring that it is the only task able to + * also acquire callback_lock and be able to modify cpusets. It can perform + * various checks on the cpuset structure first, knowing nothing will change. + * It can also allocate memory while just holding cpuset_mutex. While it is + * performing these checks, various callback routines can briefly acquire + * callback_lock to query cpusets. Once it is ready to make the changes, it + * takes callback_lock, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_lock, as that would risk double tripping on callback_lock @@ -347,35 +248,60 @@ static struct cpuset top_cpuset = { * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_lock across + * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. + */ + +static DEFINE_MUTEX(cpuset_mutex); + +/** + * cpuset_lock - Acquire the global cpuset mutex * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. This helper is not enough to make modification. */ +void cpuset_lock(void) +{ + mutex_lock(&cpuset_mutex); +} -DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); +void cpuset_unlock(void) +{ + mutex_unlock(&cpuset_mutex); +} -void cpuset_read_lock(void) +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data. + */ +void cpuset_full_lock(void) { - percpu_down_read(&cpuset_rwsem); + cpus_read_lock(); + mutex_lock(&cpuset_mutex); } -void cpuset_read_unlock(void) +void cpuset_full_unlock(void) { - percpu_up_read(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); } static DEFINE_SPINLOCK(callback_lock); -static struct workqueue_struct *cpuset_migrate_mm_wq; +void cpuset_callback_lock_irq(void) +{ + spin_lock_irq(&callback_lock); +} -/* - * CPU / memory hotplug is handled asynchronously. 
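cpuset_full_lock() fixes the lock ordering (cpus_read_lock() taken before cpuset_mutex, released in reverse), and modifiers in this file follow a common shape around it, as in cpuset_write_u64() above. A schematic writer (update_something() is a placeholder name):

/* Typical modifier shape under the locking scheme described above. */
static int update_something(struct cpuset *cs /* , new value */)
{
	int ret = -ENODEV;

	cpuset_full_lock();		/* cpus_read_lock() + cpuset_mutex */
	if (!is_cpuset_online(cs))
		goto out;
	/*
	 * ... validate, then apply the change, taking the irq-safe
	 * callback_lock around the actual mask/state writes ...
	 */
	ret = 0;
out:
	cpuset_full_unlock();		/* unlock in reverse order */
	return ret;
}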
- */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +void cpuset_callback_unlock_irq(void) +{ + spin_unlock_irq(&callback_lock); +} + +static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -383,7 +309,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { - static_branch_enable(&cpusets_insane_config_key); + static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); @@ -391,6 +317,32 @@ static inline void check_insane_mems_config(nodemask_t *nodes) } /* + * decrease cs->attach_in_progress. + * wake_up cpuset_attach_wq if cs->attach_in_progress==0. + */ +static inline void dec_attach_in_progress_locked(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +} + +static inline void dec_attach_in_progress(struct cpuset *cs) +{ + mutex_lock(&cpuset_mutex); + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); +} + +static inline bool cpuset_v2(void) +{ + return !IS_ENABLED(CONFIG_CPUSETS_V1) || + cgroup_subsys_on_dfl(cpuset_cgrp_subsys); +} + +/* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. @@ -400,10 +352,67 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline bool is_in_v2_mode(void) { - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + +/** + * partition_is_populated - check if partition has tasks + * @cs: partition root to be checked + * @excluded_child: a child cpuset to be excluded in task checking + * Return: true if there are tasks, false otherwise + * + * @cs should be a valid partition root or going to become a partition root. + * @excluded_child should be non-NULL when this cpuset is going to become a + * partition itself. + * + * Note that a remote partition is not allowed underneath a valid local + * or remote partition. So if a non-partition root child is populated, + * the whole partition is considered populated. + */ +static inline bool partition_is_populated(struct cpuset *cs, + struct cpuset *excluded_child) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + /* + * We cannot call cs_is_populated(cs) directly, as + * nr_populated_domain_children may include populated + * csets from descendants that are partitions. 
+ */ + if (cs->css.cgroup->nr_populated_csets || + cs->attach_in_progress) + return true; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + if (cp == cs || cp == excluded_child) + continue; + + if (is_partition_valid(cp)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (cpuset_is_populated(cp)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} + /* * Return in pmask the portion of a task's cpusets's cpus_allowed that * are online and are capable of running the task. If none are found, @@ -411,38 +420,26 @@ static inline bool is_in_v2_mode(void) * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct task_struct *tsk, +static void guarantee_active_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; - if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) - cpumask_copy(pmask, cpu_online_mask); + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); rcu_read_lock(); cs = task_cs(tsk); - while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus); -out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } @@ -455,7 +452,7 @@ out_unlock: * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -464,119 +461,105 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } -/* - * update task's spread flag if cpuset's page/slab spread flag is set +/** + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate + * Return: 0 if successful, -ENOMEM otherwise. * - * Call with callback_lock or cpuset_rwsem held. + * Allocates @size cpumasks and initializes them to empty. Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed. */ -static void cpuset_update_task_spread_flag(struct cpuset *cs, - struct task_struct *tsk) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} + int i; -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_rwsem. 
- */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } + } + return 0; } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer - * Return: 0 if successful, -ENOMEM otherwise. - * - * Only one of the two input arguments should be non-NULL. + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure */ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_tmpmasks(struct tmpmasks *tmp) { - cpumask_var_t *pmask1, *pmask2, *pmask3; - - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->subparts_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - return 0; + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free. + * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->subparts_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + if (!tmp) + return; + + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ? 
kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -586,10 +569,82 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } +/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{ + return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed + : cs->exclusive_cpus; +} + +static inline bool xcpus_empty(struct cpuset *cs) +{ + return cpumask_empty(cs->cpus_allowed) && + cpumask_empty(cs->exclusive_cpus); +} + +/* + * cpusets_are_exclusive() - check if two cpusets are exclusive + * + * Return true if exclusive, false if not + */ +static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) +{ + struct cpumask *xcpus1 = user_xcpus(cs1); + struct cpumask *xcpus2 = user_xcpus(cs2); + + if (cpumask_intersects(xcpus1, xcpus2)) + return false; + return true; +} + +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. 
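Together with validate_change() below, dup_or_alloc_cpuset() supports the trial-and-validate idiom used throughout this file: duplicate the cpuset, apply the proposed change to the copy, and check the structural rules before committing. Schematically (try_change() is a placeholder; error paths trimmed, caller holds cpuset_mutex):

/* Sketch: validate a proposed cpumask change against a trial cpuset. */
static int try_change(struct cpuset *cs, const struct cpumask *new_cpus)
{
	struct cpuset *trial = dup_or_alloc_cpuset(cs);
	int ret;

	if (!trial)
		return -ENOMEM;

	cpumask_copy(trial->cpus_allowed, new_cpus);
	ret = validate_change(cs, trial);	/* structural rules */
	/* on success the caller commits new_cpus to @cs */
	free_cpuset(trial);
	return ret;
}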
@@ -597,7 +652,7 @@ static inline void free_cpuset(struct cpuset *cs) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_rwsem held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -614,42 +669,27 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; - - /* The checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; + int ret = 0; rcu_read_lock(); - par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode()) + ret = cpuset1_validate_change(cur, trial); + if (ret) goto out; - /* - * If either I or some sibling (!= me) is exclusive, we can't - * overlap - */ - ret = -EINVAL; - cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) - goto out; - } + /* Remaining checks don't apply to root cpuset */ + if (cur == &top_cpuset) + goto out; + + par = parent_cs(cur); /* * Cpusets with tasks - existing or newly being attached - can't * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { + if (cpuset_is_populated(cur)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -660,14 +700,40 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* * We can't shrink if we won't have enough room for SCHED_DEADLINE - * tasks. + * tasks. This check is not done when scheduling is disabled as the + * users should know what they are doing. + * + * For v1, effective_cpus == cpus_allowed & user_xcpus() returns + * cpus_allowed. + * + * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only + * for non-isolated partition root. At this point, the target + * effective_cpus isn't computed yet. user_xcpus() is the best + * approximation. + * + * TBD: May need to precompute the real effective_cpus here in case + * incorrect scheduling of SCHED_DEADLINE tasks in a partition + * becomes an issue. */ ret = -EBUSY; - if (is_cpu_exclusive(cur) && - !cpuset_cpumask_can_shrink(cur->cpus_allowed, - trial->cpus_allowed)) + if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && + !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) goto out; + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap. exclusive_cpus cannot overlap with each other if set. + */ + ret = -EINVAL; + cpuset_for_each_child(c, css, par) { + if (c == cur) + continue; + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) + goto out; + } + ret = 0; out: rcu_read_unlock(); @@ -712,7 +778,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } -/* Must be called with cpuset_rwsem held. */ +/* Must be called with cpuset_mutex held. 
*/ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ @@ -738,7 +804,7 @@ static inline int nr_cpusets(void) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cpuset_rwsem held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a @@ -759,18 +825,15 @@ static inline int nr_cpusets(void) * were changed (added or removed.) * * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. - * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). + * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) @@ -778,20 +841,23 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ + int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); + bool cgrpv2 = cpuset_v2(); + int nslot_update; doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts_cpus) { + if (root_load_balance && cpumask_empty(subpartitions_cpus)) { +single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -803,7 +869,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } @@ -819,64 +885,81 @@ static int generate_sched_domains(cpumask_var_t **domains, cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; + + if (cgrpv2) + goto v2; + /* + * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. 
The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. - * - * If root is load-balancing, we can skip @cp if it - * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_FLAG_DOMAIN)))) - continue; - - if (root_load_balance && - cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) + housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; - /* skip @cp's subtree if not a partition root */ - if (!is_partition_root(cp)) + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + +v2: + /* + * Only valid partition roots that are not isolated and with + * non-empty effective_cpus will be saved into csa[]. + */ + if ((cp->partition_root_state == PRS_ROOT) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* + * Skip @cp's subtree if not a partition root and has no + * exclusive CPUs to be granted to child cpusets. + */ + if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); + /* + * If there are only isolated partitions underneath the cgroup root, + * we can optimize out unneeded sched domains scanning. + */ + if (root_load_balance && (csn == 1)) + goto single_root_domain; + for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; + uf_node_init(&csa[i]->node); -restart: - /* Find the best partition (set of sched domains) */ + /* Merge overlapping cpusets */ for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) { + /* + * Cgroup v2 shouldn't pass down overlapping + * partition root cpusets. + */ + WARN_ON_ONCE(cgrpv2); + uf_union(&csa[i]->node, &csa[j]->node); } } } + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. @@ -892,45 +975,48 @@ restart: dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); - for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + /* + * Cgroup v2 doesn't support domain attributes, just set all of them + * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a + * subset of HK_TYPE_DOMAIN housekeeping CPUs. + */ + if (cgrpv2) { + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain.
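/*
 * Illustrative sketch, not part of the patch: how the uf_*() pass above
 * replaces the old O(n^3) 'restart' loop. Overlapping cpusets are merged
 * with union-find, and every remaining forest root becomes one sched
 * domain. Standalone model; masks are 64-bit words and all toy_* names
 * are hypothetical.
 */
#include <stdint.h>

struct toy_uf { int parent; };

static int toy_find(struct toy_uf *n, int i)
{
	while (n[i].parent != i) {
		n[i].parent = n[n[i].parent].parent;	/* path halving */
		i = n[i].parent;
	}
	return i;
}

static void toy_union(struct toy_uf *n, int a, int b)
{
	n[toy_find(n, b)].parent = toy_find(n, a);
}

/* Returns ndoms for the given effective_cpus masks (cf. generate_sched_domains) */
static int toy_count_domains(const uint64_t *ecpus, struct toy_uf *n, int csn)
{
	int i, j, ndoms = 0;

	for (i = 0; i < csn; i++)
		n[i].parent = i;			/* uf_node_init() */

	for (i = 0; i < csn; i++)
		for (j = i + 1; j < csn; j++)
			if (ecpus[i] & ecpus[j])	/* cpusets_overlap() */
				toy_union(n, i, j);

	for (i = 0; i < csn; i++)
		if (toy_find(n, i) == i)		/* each root is one domain */
			ndoms++;
	return ndoms;
}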
+ */ + if (csa[i] == &top_cpuset) + cpumask_and(doms[i], csa[i]->effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } + goto done; + } - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; + update_domain_attr_tree(dattr + nslot, csa[j]); } } - nslot++; + if (nslot_update) + nslot++; } BUG_ON(nslot != ndoms); @@ -949,11 +1035,14 @@ done: return ndoms; } -static void update_tasks_root_domain(struct cpuset *cs) +static void dl_update_tasks_root_domain(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; + if (cs->nr_deadline_tasks == 0) + return; + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) @@ -962,22 +1051,25 @@ static void update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void rebuild_root_domains(void) +void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; + int cpu; + u64 cookie = ++dl_cookie; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); rcu_read_lock(); - /* - * Clear default root domain DL accounting, it will be computed again - * if a task belongs to it. - */ - dl_clear_root_domain(&def_root_domain); + for_each_possible_cpu(cpu) { + if (dl_bw_visited(cpu, cookie)) + continue; + + dl_clear_root_domain_cpu(cpu); + } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { @@ -990,7 +1082,7 @@ static void rebuild_root_domains(void) rcu_read_unlock(); - update_tasks_root_domain(cs); + dl_update_tasks_root_domain(cs); rcu_read_lock(); css_put(&cs->css); @@ -998,16 +1090,6 @@ static void rebuild_root_domains(void) rcu_read_unlock(); } -static void -partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - mutex_lock(&sched_domains_mutex); - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - rebuild_root_domains(); - mutex_unlock(&sched_domains_mutex); -} - /* * Rebuild scheduler domains. * @@ -1017,9 +1099,9 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_rwsem held. Takes cpus_read_lock(). + * Call with cpuset_mutex held. Takes cpus_read_lock(). 
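/*
 * Illustrative sketch, not part of the patch: the cookie scheme used by
 * dl_rebuild_rd_accounting() above. Many CPUs share one root domain, and
 * bumping a generation counter before the scan lets dl_bw_visited() clear
 * each root domain exactly once without a separate "visited" bitmap.
 * Standalone model; the two-domain layout and toy_* names are invented
 * for the demo.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_NR_CPUS 8

struct toy_rd { uint64_t cookie; int clears; };

static struct toy_rd rd_a, rd_b;
static struct toy_rd *cpu_rd[TOY_NR_CPUS] = {
	&rd_a, &rd_a, &rd_a, &rd_a, &rd_b, &rd_b, &rd_b, &rd_b,
};
static uint64_t toy_dl_cookie;

static int toy_dl_bw_visited(int cpu, uint64_t cookie)
{
	if (cpu_rd[cpu]->cookie == cookie)
		return 1;	/* a sibling CPU already visited this rd */
	cpu_rd[cpu]->cookie = cookie;
	return 0;
}

int main(void)
{
	uint64_t cookie = ++toy_dl_cookie;
	int cpu;

	for (cpu = 0; cpu < TOY_NR_CPUS; cpu++) {
		if (toy_dl_bw_visited(cpu, cookie))
			continue;
		cpu_rd[cpu]->clears++;	/* stands in for dl_clear_root_domain_cpu() */
	}
	printf("rd_a cleared %d time(s), rd_b cleared %d time(s)\n",
	       rd_a.clears, rd_b.clears);	/* prints 1 and 1 */
	return 0;
}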
*/ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; @@ -1028,18 +1110,19 @@ static void rebuild_sched_domains_locked(void) int ndoms; lockdep_assert_cpus_held(); - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); + force_sd_rebuild = false; /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset * is enough to detect racing CPU offlines. */ - if (!top_cpuset.nr_subparts_cpus && + if (cpumask_empty(subpartitions_cpus) && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; @@ -1048,10 +1131,10 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts_cpus) { + if (!cpumask_empty(subpartitions_cpus)) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_root(cs)) { + if (!is_partition_valid(cs)) { pos_css = css_rightmost_descendant(pos_css); continue; } @@ -1068,39 +1151,75 @@ static void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); + partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ +static void rebuild_sched_domains_cpuslocked(void) +{ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); +} + void rebuild_sched_domains(void) { cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); - rebuild_sched_domains_locked(); - percpu_up_write(&cpuset_rwsem); + rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } +void cpuset_reset_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + partition_sched_domains(1, NULL, NULL); + mutex_unlock(&cpuset_mutex); +} + /** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_rwsem held, + * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. + * + * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus + * to make sure all offline CPUs are also included as hotplug code won't + * update cpumasks for tasks in top_cpuset. + * + * As task_cpu_possible_mask() can be task dependent in arm64, we have to + * do cpu masking per task instead of doing it once for all. 
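/*
 * Illustrative sketch, not part of the patch: the per-task masking rule the
 * comment above describes. Tasks in the top cpuset keep their (possibly
 * task-specific) possible mask minus CPUs granted to subpartitions; all
 * other tasks get possible & effective_cpus. Masks are 64-bit words and
 * toy_task plus the helper are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_task {
	uint64_t possible_mask;	/* task_cpu_possible_mask(): can differ per task on arm64 */
	bool no_setaffinity;	/* PF_NO_SETAFFINITY, e.g. per-cpu kthreads */
	uint64_t cpus_allowed;	/* what set_cpus_allowed_ptr() would install */
};

static void toy_update_task_cpumask(struct toy_task *t, bool top_cs,
				    uint64_t effective_cpus,
				    uint64_t subpartitions_cpus)
{
	if (top_cs) {
		if (t->no_setaffinity)
			return;		/* leave per-cpu kthreads alone */
		t->cpus_allowed = t->possible_mask & ~subpartitions_cpus;
	} else {
		t->cpus_allowed = t->possible_mask & effective_cpus;
	}
}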
*/ -static void update_tasks_cpumask(struct cpuset *cs) +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; + bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cs->effective_cpus); + while ((task = css_task_iter_next(&it))) { + const struct cpumask *possible_mask = task_cpu_possible_mask(task); + + if (top_cs) { + /* + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). + */ + if (task->flags & PF_NO_SETAFFINITY) + continue; + cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); + } else { + cpumask_and(new_cpus, possible_mask, cs->effective_cpus); + } + set_cpus_allowed_ptr(task, new_cpus); + } css_task_iter_end(&it); } @@ -1110,384 +1229,1137 @@ static void update_tasks_cpumask(struct cpuset *cs) * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * - * If the parent has subpartition CPUs, include them in the list of - * allowable CPUs in computing the new effective_cpus mask. Since offlined - * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask - * to mask those out. + * The result is valid only if the given cpuset isn't a partition root. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus) { - cpumask_or(new_cpus, parent->effective_cpus, - parent->subparts_cpus); - cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); - cpumask_and(new_cpus, new_cpus, cpu_active_mask); + cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); +} + +/* + * Commands for update_parent_effective_cpumask + */ +enum partition_cmd { + partcmd_enable, /* Enable partition root */ + partcmd_enablei, /* Enable isolated partition root */ + partcmd_disable, /* Disable partition root */ + partcmd_update, /* Update parent's effective_cpus */ + partcmd_invalidate, /* Make partition invalid */ +}; + +static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, + struct tmpmasks *tmp); + +/* + * Update partition exclusive flag + * + * Return: 0 if successful, an error code otherwise + */ +static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) +{ + bool exclusive = (new_prs > PRS_MEMBER); + + if (exclusive && !is_cpu_exclusive(cs)) { + if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + return PERR_NOTEXCL; + } else if (!exclusive && is_cpu_exclusive(cs)) { + /* Turning off CS_CPU_EXCLUSIVE will not return error */ + cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + return 0; +} + +/* + * Update partition load balance flag and/or rebuild sched domain + * + * Changing load balance flag will automatically call + * rebuild_sched_domains_locked(). + * This function is for cgroup v2 only. + */ +static void update_partition_sd_lb(struct cpuset *cs, int old_prs) +{ + int new_prs = cs->partition_root_state; + bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + bool new_lb; + + /* + * If cs is not a valid partition root, the load balance state + * will follow its parent. 
+ */ + if (new_prs > 0) { + new_lb = (new_prs != PRS_ISOLATED); } else { - cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); + new_lb = is_sched_load_balance(parent_cs(cs)); } + if (new_lb != !!is_sched_load_balance(cs)) { + rebuild_domains = true; + if (new_lb) + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + } + + if (rebuild_domains) + cpuset_force_rebuild(); } /* - * Commands for update_parent_subparts_cpumask + * tasks_nocpu_error - Return true if tasks will have no effective_cpus */ -enum subparts_cmd { - partcmd_enable, /* Enable partition root */ - partcmd_disable, /* Disable partition root */ - partcmd_update, /* Update parent's subparts_cpus */ -}; +static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, + struct cpumask *xcpus) +{ + /* + * A populated partition (cs or parent) can't have empty effective_cpus + */ + return (cpumask_subset(parent->effective_cpus, xcpus) && + partition_is_populated(parent, cs)) || + (!cpumask_intersects(xcpus, cpu_active_mask) && + partition_is_populated(cs, NULL)); +} + +static void reset_partition_data(struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(cs); + + if (!cpuset_v2()) + return; + + lockdep_assert_held(&callback_lock); + + if (cpumask_empty(cs->exclusive_cpus)) { + cpumask_clear(cs->effective_xcpus); + if (is_cpu_exclusive(cs)) + clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); + } + if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) + cpumask_copy(cs->effective_cpus, parent->effective_cpus); +} + +/* + * isolated_cpus_update - Update the isolated_cpus mask + * @old_prs: old partition_root_state + * @new_prs: new partition_root_state + * @xcpus: exclusive CPUs with state change + */ +static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs == new_prs); + if (new_prs == PRS_ISOLATED) + cpumask_or(isolated_cpus, isolated_cpus, xcpus); + else + cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); + + isolated_cpus_updating = true; +} + +/* + * partition_xcpus_add - Add new exclusive CPUs to partition + * @new_prs: new partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be added + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_add(int new_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(new_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + + if (parent == &top_cpuset) + cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (new_prs != parent->partition_root_state) + isolated_cpus_update(parent->partition_root_state, new_prs, + xcpus); + + cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); +} + +/* + * partition_xcpus_del - Remove exclusive CPUs from partition + * @old_prs: old partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be removed + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_del(int old_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + if (parent == &top_cpuset) + cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (old_prs != parent->partition_root_state) + isolated_cpus_update(old_prs, parent->partition_root_state, + xcpus); + + cpumask_and(xcpus, xcpus, cpu_active_mask); + cpumask_or(parent->effective_cpus, parent->effective_cpus, 
xcpus); +} + +/* + * isolated_cpus_can_update - check for isolated & nohz_full conflicts + * @add_cpus: cpu mask for cpus that are going to be isolated + * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL + * Return: false if there is conflict, true otherwise + * + * If nohz_full is enabled and we have isolated CPUs, their combination must + * still leave housekeeping CPUs. + * + * TBD: Should consider merging this function into + * prstate_housekeeping_conflict(). + */ +static bool isolated_cpus_can_update(struct cpumask *add_cpus, + struct cpumask *del_cpus) +{ + cpumask_var_t full_hk_cpus; + int res = true; + + if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE)) + return true; + + if (del_cpus && cpumask_weight_and(del_cpus, + housekeeping_cpumask(HK_TYPE_KERNEL_NOISE))) + return true; + + if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL)) + return false; + + cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), + housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus); + cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask); + if (!cpumask_weight_andnot(full_hk_cpus, add_cpus)) + res = false; + + free_cpumask_var(full_hk_cpus); + return res; +} + +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. + */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + if (!have_boot_isolcpus) + return false; + + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + return true; + + return false; +} + +/* + * update_isolation_cpumasks - Update external isolation related CPU masks + * + * The following external CPU masks will be updated if necessary: + * - workqueue unbound cpumask + */ +static void update_isolation_cpumasks(void) +{ + int ret; + + if (!isolated_cpus_updating) + return; + + lockdep_assert_cpus_held(); + + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); + + ret = tmigr_isolated_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); + + isolated_cpus_updating = false; +} /** - * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset - * @cpuset: The cpuset that requests change in partition root state - * @cmd: Partition root state change command - * @newmask: Optional new cpumask for partcmd_update - * @tmp: Temporary addmask and delmask - * Return: 0, 1 or an error code - * - * For partcmd_enable, the cpuset is being transformed from a non-partition - * root to a partition root. The cpus_allowed mask of the given cpuset will - * be put into parent's subparts_cpus and taken away from parent's - * effective_cpus. The function will return 0 if all the CPUs listed in - * cpus_allowed can be granted or an error code will be returned. - * - * For partcmd_disable, the cpuset is being transofrmed from a partition - * root back to a non-partition root. Any CPUs in cpus_allowed that are in - * parent's subparts_cpus will be taken away from that cpumask and put back - * into parent's effective_cpus. 0 should always be returned. - * - * For partcmd_update, if the optional newmask is specified, the cpu - * list is to be changed from cpus_allowed to newmask. Otherwise, - * cpus_allowed is assumed to remain the same. 
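/*
 * Illustrative sketch, not part of the patch: the invariant that
 * isolated_cpus_can_update() above enforces, reduced to 64-bit masks.
 * After isolating add_cpus, at least one fully-housekeeping CPU
 * (nohz_full housekeeping & domain housekeeping & active & not already
 * isolated) must survive. All toy_* parameters are stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>

static bool toy_isolated_cpus_can_update(uint64_t add_cpus, uint64_t del_cpus,
					 uint64_t hk_noise, uint64_t hk_domain,
					 uint64_t isolated, uint64_t active,
					 bool nohz_full_enabled)
{
	uint64_t full_hk;

	if (!nohz_full_enabled)
		return true;			/* nothing to protect */
	if (del_cpus & hk_noise)
		return true;			/* a housekeeping CPU is coming back */

	full_hk = hk_noise & hk_domain & ~isolated & active;
	return (full_hk & ~add_cpus) != 0;	/* one must remain after the add */
}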
The cpuset should either - * be a partition root or an invalid partition root. The partition root - * state may change if newmask is NULL and none of the requested CPUs can - * be granted by the parent. The function will return 1 if changes to - * parent's subparts_cpus and effective_cpus happen or 0 otherwise. - * Error code should only be returned when newmask is non-NULL. - * - * The partcmd_enable and partcmd_disable commands are used by - * update_prstate(). The partcmd_update command is used by - * update_cpumasks_hier() with newmask NULL and update_cpumask() with - * newmask set. - * - * The checking is more strict when enabling partition root than the - * other two commands. - * - * Because of the implicit cpu exclusive nature of a partition root, - * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. - */ -static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, - struct cpumask *newmask, - struct tmpmasks *tmp) -{ - struct cpuset *parent = parent_cs(cpuset); - int adding; /* Moving cpus from effective_cpus to subparts_cpus */ - int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ - int old_prs, new_prs; - bool part_error = false; /* Partition error? */ + * cpuset_cpu_is_isolated - Check if the given CPU is isolated + * @cpu: the CPU number to be checked + * Return: true if CPU is used in an isolated partition, false otherwise + */ +bool cpuset_cpu_is_isolated(int cpu) +{ + return cpumask_test_cpu(cpu, isolated_cpus); +} +EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); - percpu_rwsem_assert_held(&cpuset_rwsem); +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify + * + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. + */ +static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, + struct cpumask *excpus) +{ + struct cgroup_subsys_state *css; + struct cpuset *sibling; + int retval = 0; + + if (cpumask_empty(excpus)) + return retval; /* - * The parent must be a partition root. - * The new cpumask, if present, or the current cpus_allowed must - * not be empty. 
+ * Exclude exclusive CPUs from siblings */ - if (!is_partition_root(parent) || - (newmask && cpumask_empty(newmask)) || - (!newmask && cpumask_empty(cpuset->cpus_allowed))) - return -EINVAL; + rcu_read_lock(); + cpuset_for_each_child(sibling, css, parent) { + if (sibling == cs) + continue; + + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); + retval++; + continue; + } + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); + retval++; + } + } + rcu_read_unlock(); + + return retval; +} + +/* + * compute_excpus - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: 0 if there is no sibling conflict, > 0 otherwise + * + * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets + * and exclude their exclusive_cpus or effective_xcpus as well. + */ +static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) +{ + struct cpuset *parent = parent_cs(cs); + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); + + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +/* + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset + * @trialcs: The trial cpuset containing the proposed new configuration + * @cs: The original cpuset that the trial configuration is based on + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found + * + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent + * the real cs. + */ +static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(trialcs); + struct cpumask *excpus = trialcs->effective_xcpus; + + /* trialcs is member, cpuset.cpus has no impact to excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +static inline bool is_remote_partition(struct cpuset *cs) +{ + return cs->remote_partition; +} + +static inline bool is_local_partition(struct cpuset *cs) +{ + return is_partition_valid(cs) && !is_remote_partition(cs); +} + +/* + * remote_partition_enable - Enable current cpuset as a remote partition root + * @cs: the cpuset to update + * @new_prs: new partition_root_state + * @tmp: temporary masks + * Return: 0 if successful, errcode if error + * + * Enable the current cpuset to become a remote partition root taking CPUs + * directly from the top cpuset. cpuset_mutex must be held by the caller. + */ +static int remote_partition_enable(struct cpuset *cs, int new_prs, + struct tmpmasks *tmp) +{ /* - * Enabling/disabling partition root is not allowed if there are - * online children. + * The user must have sysadmin privilege. */ - if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css)) - return -EBUSY; + if (!capable(CAP_SYS_ADMIN)) + return PERR_ACCESS; /* - * Enabling partition root is not allowed if not all the CPUs - * can be granted from parent's effective_cpus or at least one - * CPU will be left after that. + * The requested exclusive_cpus must not be allocated to other + * partitions and it can't use up all the root's effective_cpus. 
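/*
 * Illustrative sketch, not part of the patch: compute_excpus() above in
 * miniature. The requested exclusive CPUs are clamped to the parent's
 * effective_xcpus; only when exclusive_cpus was left unset are CPUs
 * already claimed by siblings removed as well. 64-bit masks; the sibling
 * array stands in for cpuset_for_each_child().
 */
#include <stdint.h>

struct toy_sib { uint64_t exclusive_cpus; uint64_t effective_xcpus; };

/* Returns the computed effective_xcpus for a child requesting user_xcpus */
static uint64_t toy_compute_excpus(uint64_t user_xcpus, int explicit_excl,
				   uint64_t parent_effective_xcpus,
				   const struct toy_sib *sib, int nr_sib)
{
	uint64_t excpus = user_xcpus & parent_effective_xcpus;
	int i;

	if (explicit_excl)		/* explicit exclusive_cpus wins outright */
		return excpus;

	for (i = 0; i < nr_sib; i++) {	/* rm_siblings_excl_cpus() */
		excpus &= ~sib[i].exclusive_cpus;
		excpus &= ~sib[i].effective_xcpus;
	}
	return excpus;
}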
+ * + * The effective_xcpus mask can contain offline CPUs, but there must + * be at least one or more online CPUs present before it can be enabled. + * + * Note that creating a remote partition with any local partition root + * above it or remote partition root underneath it is not allowed. */ - if ((cmd == partcmd_enable) && - (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) || - cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))) - return -EINVAL; + compute_excpus(cs, tmp->new_cpus); + WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); + if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || + cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) + return PERR_INVCPUS; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->new_cpus, NULL)) || + prstate_housekeeping_conflict(new_prs, tmp->new_cpus)) + return PERR_HKEEPING; + + spin_lock_irq(&callback_lock); + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + cs->remote_partition = true; + cpumask_copy(cs->effective_xcpus, tmp->new_cpus); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + cpuset_force_rebuild(); + cs->prs_err = 0; + + /* + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. + */ + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return 0; +} + +/* + * remote_partition_disable - Remove current cpuset from remote partition list + * @cs: the cpuset to update + * @tmp: temporary masks + * + * The effective_cpus is also updated. + * + * cpuset_mutex must be held by the caller. + */ +static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) +{ + WARN_ON_ONCE(!is_remote_partition(cs)); + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + spin_lock_irq(&callback_lock); + cs->remote_partition = false; + partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); + if (cs->prs_err) + cs->partition_root_state = -cs->partition_root_state; + else + cs->partition_root_state = PRS_MEMBER; + /* effective_xcpus may need to be changed */ + compute_excpus(cs, cs->effective_xcpus); + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + cpuset_force_rebuild(); + + /* + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. + */ + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); +} + +/* + * remote_cpus_update - cpus_exclusive change of remote partition + * @cs: the cpuset to be updated + * @xcpus: the new exclusive_cpus mask, if non-NULL + * @excpus: the new effective_xcpus mask + * @tmp: temporary masks + * + * top_cpuset and subpartitions_cpus will be updated or partition can be + * invalidated. + */ +static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, + struct cpumask *excpus, struct tmpmasks *tmp) +{ + bool adding, deleting; + int prs = cs->partition_root_state; + + if (WARN_ON_ONCE(!is_remote_partition(cs))) + return; + + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + if (cpumask_empty(excpus)) { + cs->prs_err = PERR_CPUSEMPTY; + goto invalidate; + } + + adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); + + /* + * Additions of remote CPUs is only allowed if those CPUs are + * not allocated to other partitions and there are effective_cpus + * left in the top cpuset. 
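/*
 * Usage sketch, not part of the patch: driving the remote partition path
 * above from user space. Assumes a v2 cgroup at /sys/fs/cgroup/rt with the
 * cpuset controller enabled; per Documentation/admin-guide/cgroup-v2.rst,
 * writing the exclusive CPUs to cpuset.cpus.exclusive and "isolated" to
 * cpuset.cpus.partition is what (on success) reaches
 * remote_partition_enable() with new_prs == PRS_ISOLATED. The cgroup path
 * and CPU range are made up; deeper hierarchies may also need
 * cpuset.cpus.exclusive set on intermediate ancestors, depending on
 * kernel version.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int wr(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror(path);
		return -1;
	}
	n = write(fd, val, strlen(val));
	if (n < 0)
		perror(path);
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* hypothetical cgroup; CPUs 2-3 become an isolated remote partition */
	wr("/sys/fs/cgroup/rt/cpuset.cpus", "2-3");
	wr("/sys/fs/cgroup/rt/cpuset.cpus.exclusive", "2-3");
	wr("/sys/fs/cgroup/rt/cpuset.cpus.partition", "isolated");
	return 0;
}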
+ */ + if (adding) { + WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); + if (!capable(CAP_SYS_ADMIN)) + cs->prs_err = PERR_ACCESS; + else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) + cs->prs_err = PERR_NOCPUS; + else if ((prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + cs->prs_err = PERR_HKEEPING; + if (cs->prs_err) + goto invalidate; + } + + spin_lock_irq(&callback_lock); + if (adding) + partition_xcpus_add(prs, NULL, tmp->addmask); + if (deleting) + partition_xcpus_del(prs, NULL, tmp->delmask); /* - * A cpumask update cannot make parent's effective_cpus become empty. + * Need to update effective_xcpus and exclusive_cpus now as + * update_sibling_cpumasks() below may iterate back to the same cs. + */ + cpumask_copy(cs->effective_xcpus, excpus); + if (xcpus) + cpumask_copy(cs->exclusive_cpus, xcpus); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + if (adding || deleting) + cpuset_force_rebuild(); + + /* + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. + */ + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return; + +invalidate: + remote_partition_disable(cs, tmp); +} + +/** + * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset + * @cs: The cpuset that requests change in partition root state + * @cmd: Partition root state change command + * @newmask: Optional new cpumask for partcmd_update + * @tmp: Temporary addmask and delmask + * Return: 0 or a partition root state error code + * + * For partcmd_enable*, the cpuset is being transformed from a non-partition + * root to a partition root. The effective_xcpus (cpus_allowed if + * effective_xcpus not set) mask of the given cpuset will be taken away from + * parent's effective_cpus. The function will return 0 if all the CPUs listed + * in effective_xcpus can be granted or an error code will be returned. + * + * For partcmd_disable, the cpuset is being transformed from a partition + * root back to a non-partition root. Any CPUs in effective_xcpus will be + * given back to parent's effective_cpus. 0 will always be returned. + * + * For partcmd_update, if the optional newmask is specified, the cpu list is + * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is + * assumed to remain the same. The cpuset should either be a valid or invalid + * partition root. The partition root state may change from valid to invalid + * or vice versa. An error code will be returned if transitioning from + * invalid to valid violates the exclusivity rule. + * + * For partcmd_invalidate, the current partition will be made invalid. + * + * The partcmd_enable* and partcmd_disable commands are used by + * update_prstate(). An error code may be returned and the caller will check + * for error. + * + * The partcmd_update command is used by update_cpumasks_hier() with newmask + * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used + * by update_cpumask() with NULL newmask. In both cases, the callers won't + * check for error and so partition_root_state and prs_err will be updated + * directly. 
+ */ +static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + struct cpumask *newmask, + struct tmpmasks *tmp) +{ + struct cpuset *parent = parent_cs(cs); + int adding; /* Adding cpus to parent's effective_cpus */ + int deleting; /* Deleting cpus from parent's effective_cpus */ + int old_prs, new_prs; + int part_error = PERR_NONE; /* Partition error? */ + struct cpumask *xcpus = user_xcpus(cs); + int parent_prs = parent->partition_root_state; + bool nocpu; + + lockdep_assert_held(&cpuset_mutex); + WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ + + /* + * new_prs will only be changed for the partcmd_update and + * partcmd_invalidate commands. */ adding = deleting = false; - old_prs = new_prs = cpuset->partition_root_state; - if (cmd == partcmd_enable) { - cpumask_copy(tmp->addmask, cpuset->cpus_allowed); - adding = true; + old_prs = new_prs = cs->partition_root_state; + + if (cmd == partcmd_invalidate) { + if (is_partition_invalid(cs)) + return 0; + + /* + * Make the current partition invalid. + */ + if (is_partition_valid(parent)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + if (old_prs > 0) + new_prs = -old_prs; + + goto write_error; + } + + /* + * The parent must be a partition root. + * The new cpumask, if present, or the current cpus_allowed must + * not be empty. + */ + if (!is_partition_valid(parent)) { + return is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART; + } + if (!newmask && xcpus_empty(cs)) + return PERR_CPUSEMPTY; + + nocpu = tasks_nocpu_error(parent, cs, xcpus); + + if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { + /* + * Need to call compute_excpus() in case + * exclusive_cpus not set. Sibling conflict should only happen + * if exclusive_cpus isn't set. + */ + xcpus = tmp->delmask; + if (compute_excpus(cs, xcpus)) + WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; + + /* + * Enabling partition root is not allowed if its + * effective_xcpus is empty. + */ + if (cpumask_empty(xcpus)) + return PERR_INVCPUS; + + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + + if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) && + !isolated_cpus_can_update(xcpus, NULL)) + return PERR_HKEEPING; + + if (tasks_nocpu_error(parent, cs, xcpus)) + return PERR_NOCPUS; + + /* + * This function will only be called when all the preliminary + * checks have passed. At this point, the following condition + * should hold. + * + * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus + * + * Warn if it is not the case. + */ + cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + + deleting = true; } else if (cmd == partcmd_disable) { - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, - parent->subparts_cpus); + /* + * May need to add cpus back to parent's effective_cpus + * (and maybe removed from subpartitions_cpus/isolated_cpus) + * for valid partition root. xcpus may contain CPUs that + * shouldn't be removed from the two global cpumasks. 
+ */ + if (is_partition_valid(cs)) { + cpumask_copy(tmp->addmask, cs->effective_xcpus); + adding = true; + } + new_prs = PRS_MEMBER; } else if (newmask) { /* + * Empty cpumask is not allowed + */ + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; + goto write_error; + } + + /* Check newmask again, whether cpus are available for parent/cs */ + nocpu |= tasks_nocpu_error(parent, cs, newmask); + + /* * partcmd_update with newmask: * - * delmask = cpus_allowed & ~newmask & parent->subparts_cpus - * addmask = newmask & parent->effective_cpus - * & ~parent->subparts_cpus + * Compute add/delete mask to/from effective_cpus + * + * For valid partition: + * addmask = exclusive_cpus & ~newmask + * & parent->effective_xcpus + * delmask = newmask & ~exclusive_cpus + * & parent->effective_xcpus + * + * For invalid partition: + * delmask = newmask & parent->effective_xcpus + * The partition may become valid soon. */ - cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask); - deleting = cpumask_and(tmp->delmask, tmp->delmask, - parent->subparts_cpus); + if (is_partition_invalid(cs)) { + adding = false; + deleting = cpumask_and(tmp->delmask, + newmask, parent->effective_xcpus); + } else { + cpumask_andnot(tmp->addmask, xcpus, newmask); + adding = cpumask_and(tmp->addmask, tmp->addmask, + parent->effective_xcpus); + + cpumask_andnot(tmp->delmask, newmask, xcpus); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->effective_xcpus); + } - cpumask_and(tmp->addmask, newmask, parent->effective_cpus); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); /* - * Return error if the new effective_cpus could become empty. + * TBD: Invalidate a currently valid child root partition may + * still break isolated_cpus_can_update() rule if parent is an + * isolated partition. */ - if (adding && - cpumask_equal(parent->effective_cpus, tmp->addmask)) { - if (!deleting) - return -EINVAL; - /* - * As some of the CPUs in subparts_cpus might have - * been offlined, we need to compute the real delmask - * to confirm that. - */ - if (!cpumask_and(tmp->addmask, tmp->delmask, - cpu_active_mask)) - return -EINVAL; - cpumask_copy(tmp->addmask, parent->effective_cpus); + if (is_partition_valid(cs) && (old_prs != parent_prs)) { + if ((parent_prs == PRS_ROOT) && + /* Adding to parent means removing isolated CPUs */ + !isolated_cpus_can_update(tmp->delmask, tmp->addmask)) + part_error = PERR_HKEEPING; + if ((parent_prs == PRS_ISOLATED) && + /* Adding to parent means adding isolated CPUs */ + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + part_error = PERR_HKEEPING; + } + + /* + * The new CPUs to be removed from parent's effective CPUs + * must be present. + */ + if (deleting) { + cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + } + + /* + * Make partition invalid if parent's effective_cpus could + * become empty and there are tasks in the parent. + */ + if (nocpu && (!adding || + !cpumask_intersects(tmp->addmask, cpu_active_mask))) { + part_error = PERR_NOCPUS; + deleting = false; + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); } } else { /* - * partcmd_update w/o newmask: + * partcmd_update w/o newmask * - * addmask = cpus_allowed & parent->effective_cpus + * delmask = effective_xcpus & parent->effective_cpus * - * Note that parent's subparts_cpus may have been - * pre-shrunk in case there is a change in the cpu list. - * So no deletion is needed. 
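/*
 * Illustrative sketch, not part of the patch: the partcmd_update/newmask
 * arithmetic above with 64-bit masks. CPUs the partition gives up go back
 * to the parent (addmask); CPUs it newly claims leave the parent
 * (delmask); both are clamped to parent->effective_xcpus. Toy names only.
 */
#include <stdint.h>

struct toy_masks { uint64_t addmask, delmask; };

static struct toy_masks toy_partcmd_update(uint64_t xcpus,	/* current user_xcpus */
					   uint64_t newmask,	/* requested new mask */
					   uint64_t parent_effective_xcpus,
					   int partition_valid)
{
	struct toy_masks m = { 0, 0 };

	if (!partition_valid) {
		/* invalid partition: only track what it could take later */
		m.delmask = newmask & parent_effective_xcpus;
		return m;
	}
	/* give back to the parent what the partition no longer wants */
	m.addmask = xcpus & ~newmask & parent_effective_xcpus;
	/* take from the parent what the partition newly claims */
	m.delmask = newmask & ~xcpus & parent_effective_xcpus;
	return m;
}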
+ * This can be called from: + * 1) update_cpumasks_hier() + * 2) cpuset_hotplug_update_tasks() + * + * Check to see if it can be transitioned from valid to + * invalid partition or vice versa. + * + * A partition error happens when parent has tasks and all + * its effective CPUs will have to be distributed out. */ - adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, - parent->effective_cpus); - part_error = cpumask_equal(tmp->addmask, - parent->effective_cpus); + if (nocpu) { + part_error = PERR_NOCPUS; + if (is_partition_valid(cs)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && + cpumask_subset(xcpus, parent->effective_xcpus)) { + struct cgroup_subsys_state *css; + struct cpuset *child; + bool exclusive = true; + + /* + * Convert invalid partition to valid has to + * pass the cpu exclusivity test. + */ + rcu_read_lock(); + cpuset_for_each_child(child, css, parent) { + if (child == cs) + continue; + if (!cpusets_are_exclusive(cs, child)) { + exclusive = false; + break; + } + } + rcu_read_unlock(); + if (exclusive) + deleting = cpumask_and(tmp->delmask, + xcpus, parent->effective_cpus); + else + part_error = PERR_NOTEXCL; + } } - if (cmd == partcmd_update) { - int prev_prs = cpuset->partition_root_state; +write_error: + if (part_error) + WRITE_ONCE(cs->prs_err, part_error); + if (cmd == partcmd_update) { /* - * Check for possible transition between PRS_ENABLED - * and PRS_ERROR. + * Check for possible transition between valid and invalid + * partition root. */ - switch (cpuset->partition_root_state) { - case PRS_ENABLED: + switch (cs->partition_root_state) { + case PRS_ROOT: + case PRS_ISOLATED: if (part_error) - new_prs = PRS_ERROR; + new_prs = -old_prs; break; - case PRS_ERROR: + case PRS_INVALID_ROOT: + case PRS_INVALID_ISOLATED: if (!part_error) - new_prs = PRS_ENABLED; + new_prs = -old_prs; break; } - /* - * Set part_error if previously in invalid state. - */ - part_error = (prev_prs == PRS_ERROR); - } - - if (!part_error && (new_prs == PRS_ERROR)) - return 0; /* Nothing need to be done */ - - if (new_prs == PRS_ERROR) { - /* - * Remove all its cpus from parent's subparts_cpus. - */ - adding = false; - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, - parent->subparts_cpus); } if (!adding && !deleting && (new_prs == old_prs)) return 0; /* - * Change the parent's subparts_cpus. + * Transitioning between invalid to valid or vice versa may require + * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, + * validate_change() has already been successfully called and + * CPU lists in cs haven't been updated yet. So defer it to later. + */ + if ((old_prs != new_prs) && (cmd != partcmd_update)) { + int err = update_partition_exclusive_flag(cs, new_prs); + + if (err) + return err; + } + + /* + * Change the parent's effective_cpus & effective_xcpus (top cpuset + * only). + * * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (adding) { - cpumask_or(parent->subparts_cpus, - parent->subparts_cpus, tmp->addmask); - cpumask_andnot(parent->effective_cpus, - parent->effective_cpus, tmp->addmask); - } - if (deleting) { - cpumask_andnot(parent->subparts_cpus, - parent->subparts_cpus, tmp->delmask); - /* - * Some of the CPUs in subparts_cpus might have been offlined. 
- */ - cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask); - cpumask_or(parent->effective_cpus, - parent->effective_cpus, tmp->delmask); + if (old_prs != new_prs) + cs->partition_root_state = new_prs; + + /* + * Adding to parent's effective_cpus means deleting CPUs from cs + * and vice versa. + */ + if (adding) + partition_xcpus_del(old_prs, parent, tmp->addmask); + if (deleting) + partition_xcpus_add(new_prs, parent, tmp->delmask); + + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + + if ((old_prs != new_prs) && (cmd == partcmd_update)) + update_partition_exclusive_flag(cs, new_prs); + + if (adding || deleting) { + cpuset_update_tasks_cpumask(parent, tmp->addmask); + update_sibling_cpumasks(parent, cs, tmp); } - parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); + /* + * For partcmd_update without newmask, it is being called from + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. + */ + if ((cmd == partcmd_update) && !newmask) + update_partition_sd_lb(cs, old_prs); - if (old_prs != new_prs) - cpuset->partition_root_state = new_prs; + notify_partition_change(cs, old_prs); + return 0; +} - spin_unlock_irq(&callback_lock); - notify_partition_change(cpuset, old_prs, new_prs); +/** + * compute_partition_effective_cpumask - compute effective_cpus for partition + * @cs: partition root cpuset + * @new_ecpus: previously computed effective_cpus to be updated + * + * Compute the effective_cpus of a partition root by scanning effective_xcpus + * of child partition roots and excluding their effective_xcpus. + * + * This has the side effect of invalidating valid child partition roots, + * if necessary. Since it is called from either cpuset_hotplug_update_tasks() + * or update_cpumasks_hier() where parent and children are modified + * successively, we don't need to call update_parent_effective_cpumask() + * and the child's effective_cpus will be updated in later iterations. + * + * Note that rcu_read_lock() is assumed to be held. + */ +static void compute_partition_effective_cpumask(struct cpuset *cs, + struct cpumask *new_ecpus) +{ + struct cgroup_subsys_state *css; + struct cpuset *child; + bool populated = partition_is_populated(cs, NULL); + + /* + * Check child partition roots to see if they should be + * invalidated when + * 1) child effective_xcpus not a subset of new + * exclusive_cpus + * 2) All the effective_cpus will be used up and cp + * has tasks + */ + compute_excpus(cs, new_ecpus); + cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); - return cmd == partcmd_update; + rcu_read_lock(); + cpuset_for_each_child(child, css, cs) { + if (!is_partition_valid(child)) + continue; + + /* + * There shouldn't be a remote partition underneath another + * partition root.
+ */ + WARN_ON_ONCE(is_remote_partition(child)); + child->prs_err = 0; + if (!cpumask_subset(child->effective_xcpus, + cs->effective_xcpus)) + child->prs_err = PERR_INVCPUS; + else if (populated && + cpumask_subset(new_ecpus, child->effective_xcpus)) + child->prs_err = PERR_NOCPUS; + + if (child->prs_err) { + int old_prs = child->partition_root_state; + + /* + * Invalidate child partition + */ + spin_lock_irq(&callback_lock); + make_partition_invalid(child); + spin_unlock_irq(&callback_lock); + notify_partition_change(child, old_prs); + continue; + } + cpumask_andnot(new_ecpus, new_ecpus, + child->effective_xcpus); + } + rcu_read_unlock(); } /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup + * @force: don't skip any descendant cpusets if set * * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_rwsem held + * Called with cpuset_mutex held */ -static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) +static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; - bool need_rebuild_sched_domains = false; int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool remote = is_remote_partition(cp); + bool update_parent = false; - compute_effective_cpumask(tmp->new_cpus, cp, parent); + old_prs = new_prs = cp->partition_root_state; /* - * If it becomes empty, inherit the effective mask of the - * parent, which is guaranteed to have some CPUs. + * For child remote partition root (!= cs), we need to call + * remote_cpus_update() if effective_xcpus will be changed. + * Otherwise, we can skip the whole subtree. + * + * remote_cpus_update() will reuse tmp->new_cpus only after + * its value is being processed. */ - if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { - cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (!cp->use_parent_ecpus) { - cp->use_parent_ecpus = true; - parent->child_ecpus_count++; + if (remote && (cp != cs)) { + compute_excpus(cp, tmp->new_cpus); + if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } - } else if (cp->use_parent_ecpus) { - cp->use_parent_ecpus = false; - WARN_ON_ONCE(!parent->child_ecpus_count); - parent->child_ecpus_count--; + rcu_read_unlock(); + remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); + rcu_read_lock(); + + /* Remote partition may be invalidated */ + new_prs = cp->partition_root_state; + remote = (new_prs == old_prs); + } + + if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) + compute_partition_effective_cpumask(cp, tmp->new_cpus); + else + compute_effective_cpumask(tmp->new_cpus, cp, parent); + + if (remote) + goto get_css; /* Ready to update cpuset data */ + + /* + * A partition with no effective_cpus is allowed as long as + * there is no task associated with it. Call + * update_parent_effective_cpumask() to check it. + */ + if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { + update_parent = true; + goto update_parent_effective; } /* - * Skip the whole subtree if the cpumask remains the same - * and has no partition root state. 
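/*
 * Illustrative sketch, not part of the patch:
 * compute_partition_effective_cpumask() above in miniature. A partition
 * root's effective_cpus is its effective exclusive CPUs, intersected with
 * the active mask, minus whatever valid child partitions have carved out;
 * a child whose claim no longer fits is invalidated (the populated/
 * PERR_NOCPUS case is omitted here). 64-bit masks, hypothetical toy_* names.
 */
#include <stdint.h>

struct toy_child {
	uint64_t effective_xcpus;
	int valid;		/* is_partition_valid() */
	int prs_err;		/* why it was invalidated, if it was */
};

static uint64_t toy_partition_effective(uint64_t excpus, uint64_t active,
					struct toy_child *child, int nr)
{
	uint64_t ecpus = excpus & active;
	int i;

	for (i = 0; i < nr; i++) {
		if (!child[i].valid)
			continue;
		if (child[i].effective_xcpus & ~excpus) {
			child[i].valid = 0;	/* PERR_INVCPUS: no longer a subset */
			child[i].prs_err = 1;
			continue;
		}
		ecpus &= ~child[i].effective_xcpus;	/* carved out by the child */
	}
	return ecpus;
}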
+ * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some CPUs unless + * it is a partition root that has explicitly distributed + * out all its CPUs. */ - if (!cp->partition_root_state && - cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) + cpumask_copy(tmp->new_cpus, parent->effective_cpus); + + /* + * Skip the whole subtree if + * 1) the cpumask remains the same, + * 2) has no partition root state, + * 3) force flag not set, and + * 4) for v2 load balance state same as its parent. + */ + if (!cp->partition_root_state && !force && + cpumask_equal(tmp->new_cpus, cp->effective_cpus) && + (!cpuset_v2() || + (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } +update_parent_effective: /* - * update_parent_subparts_cpumask() should have been called + * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call - * update_tasks_cpumask() again for tasks in the parent - * cpuset if the parent's subparts_cpus changes. + * cpuset_update_tasks_cpumask() again for tasks in the parent + * cpuset if the parent's effective_cpus changes. */ - old_prs = new_prs = cp->partition_root_state; if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { - case PRS_DISABLED: - /* - * If parent is not a partition root or an - * invalid partition root, clear its state - * and its CS_CPU_EXCLUSIVE flag. - */ - WARN_ON_ONCE(cp->partition_root_state - != PRS_ERROR); - new_prs = PRS_DISABLED; - - /* - * clear_bit() is an atomic operation and - * readers aren't interested in the state - * of CS_CPU_EXCLUSIVE anyway. So we can - * just update the flag without holding - * the callback_lock. - */ - clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); - break; - - case PRS_ENABLED: - if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp)) - update_tasks_cpumask(parent); + case PRS_ROOT: + case PRS_ISOLATED: + update_parent = true; break; - case PRS_ERROR: + default: /* - * When parent is invalid, it has to be too. + * When parent is not a partition root or is + * invalid, child partition roots become + * invalid too. */ - new_prs = PRS_ERROR; + if (is_partition_valid(cp)) + new_prs = -cp->partition_root_state; + WRITE_ONCE(cp->prs_err, + is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART); break; } } - +get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); - spin_lock_irq(&callback_lock); - - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } else if (cp->nr_subparts_cpus) { + if (update_parent) { + update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); /* - * Make sure that effective_cpus & subparts_cpus - * are mutually exclusive. - * - * In the unlikely event that effective_cpus - * becomes empty. we clear cp->nr_subparts_cpus and - * let its child partition roots to compete for - * CPUs again. + * The cpuset partition_root_state may become + * invalid. Capture it. 
*/ - cpumask_andnot(cp->effective_cpus, cp->effective_cpus, - cp->subparts_cpus); - if (cpumask_empty(cp->effective_cpus)) { - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - cpumask_clear(cp->subparts_cpus); - cp->nr_subparts_cpus = 0; - } else if (!cpumask_subset(cp->subparts_cpus, - tmp->new_cpus)) { - cpumask_andnot(cp->subparts_cpus, - cp->subparts_cpus, tmp->new_cpus); - cp->nr_subparts_cpus - = cpumask_weight(cp->subparts_cpus); - } + new_prs = cp->partition_root_state; } - if (new_prs != old_prs) - cp->partition_root_state = new_prs; + spin_lock_irq(&callback_lock); + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + cp->partition_root_state = new_prs; + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) + compute_excpus(cp, cp->effective_xcpus); + /* + * Make sure effective_xcpus is properly set for a valid + * partition root. + */ + if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) + cpumask_and(cp->effective_xcpus, + cp->cpus_allowed, parent->effective_xcpus); + else if (new_prs < 0) + reset_partition_data(cp); spin_unlock_irq(&callback_lock); - notify_partition_change(cp, old_prs, new_prs); + + notify_partition_change(cp, old_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp); + cpuset_update_tasks_cpumask(cp, cp->effective_cpus); + + /* + * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE + * from parent if current cpuset isn't a valid partition root + * and their load balance states differ. + */ + if (cpuset_v2() && !is_partition_valid(cp) && + (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { + if (is_sched_load_balance(parent)) + set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + } /* * On legacy hierarchy, if the effective cpumask of any non- @@ -1497,17 +2369,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_root(cp))) - need_rebuild_sched_domains = true; + (!cpuset_v2() || is_partition_valid(cp))) + cpuset_force_rebuild(); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); - - if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); } /** @@ -1522,23 +2390,165 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + lockdep_assert_held(&cpuset_mutex); + /* * Check all its siblings and call update_cpumasks_hier() - * if their use_parent_ecpus flag is set in order for them - * to use the right effective_cpus value. + * if their effective_cpus will need to be changed. + * + * It is possible a change in parent's effective_cpus + * due to a change in a child partition's effective_xcpus will impact + * its siblings even if they do not inherit parent's effective_cpus + * directly. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus) + if (!is_partition_valid(sibling)) { + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) + continue; + } else if (is_remote_partition(sibling)) { + /* + * Change in a sibling cpuset won't affect a remote + * partition root. 
+ */ continue; + } - update_cpumasks_hier(sibling, tmp); + if (!css_tryget_online(&sibling->css)) + continue; + + rcu_read_unlock(); + update_cpumasks_hier(sibling, tmp, false); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) +{ + int retval; + + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; + + return 0; +} + +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. + * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); + + if (cs_is_member(trialcs)) + return PERR_NONE; + + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; + + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; + + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; + + return PERR_NONE; +} + +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); + + retval = validate_change(cs, trialcs); + + if ((retval == -EINVAL) && cpuset_v2()) { + struct cgroup_subsys_state *css; + struct cpuset *cp; + + /* + * The -EINVAL error code indicates that partition sibling + * CPU exclusivity rule has been violated. We still allow + * the cpumask change to proceed while invalidating the + * partition. However, any conflicting sibling partitions + * have to be marked as invalid too. + */ + trialcs->prs_err = PERR_NOTEXCL; + rcu_read_lock(); + cpuset_for_each_child(cp, css, parent) { + struct cpumask *xcpus = user_xcpus(trialcs); + + if (is_partition_valid(cp) && + cpumask_intersects(xcpus, cp->effective_xcpus)) { + rcu_read_unlock(); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); + rcu_read_lock(); + } + } + rcu_read_unlock(); + retval = 0; + } + return retval; +} + +/** + * partition_cpus_change - Handle partition state changes due to CPU mask updates + * @cs: The target cpuset being modified + * @trialcs: The trial cpuset containing proposed configuration changes + * @tmp: Temporary masks for intermediate calculations + * + * This function handles partition state transitions triggered by CPU mask changes. + * CPU modifications may cause a partition to be disabled or require state updates. 
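
parse_cpuset_cpulist() defers the actual parsing to the kernel's cpulist_parse() and then bounds the result against top_cpuset. A userspace approximation of the list syntax it accepts ("0-3,7"), assuming at most 64 CPUs and omitting the kernel's stride/group extension ("0-7:2"):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse "0-3,7,9-11" into a bitmask; returns 0 on syntax error. */
static uint64_t parse_cpulist(const char *s)
{
	uint64_t mask = 0;

	while (*s) {
		char *end;
		long a = strtol(s, &end, 10), b = a;

		if (end == s)
			return 0;
		if (*end == '-') {
			s = end + 1;
			b = strtol(s, &end, 10);
		}
		if (a < 0 || b > 63 || a > b)
			return 0;
		for (long c = a; c <= b; c++)
			mask |= 1ULL << c;
		s = (*end == ',') ? end + 1 : end;
	}
	return mask;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)parse_cpulist("0-3,7")); /* 0x8f */
	return 0;
}
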
+ */ +static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + enum prs_errcode prs_err; + + if (cs_is_member(cs)) + return; + + prs_err = validate_partition(cs, trialcs); + if (prs_err) + trialcs->prs_err = cs->prs_err = prs_err; + + if (is_remote_partition(cs)) { + if (trialcs->prs_err) + remote_partition_disable(cs, tmp); + else + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, tmp); + } else { + if (trialcs->prs_err) + update_parent_effective_cpumask(cs, partcmd_invalidate, + NULL, tmp); + else + update_parent_effective_cpumask(cs, partcmd_update, + trialcs->effective_xcpus, tmp); + } +} + /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider @@ -1550,81 +2560,120 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; - - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; - - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; - } + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - retval = validate_change(cs, trialcs); + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; + + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); if (retval < 0) - return retval; + goto out_free; -#ifdef CONFIG_CPUMASK_OFFSTACK /* - * Use the cpumasks in trialcs for tmpmasks when they are pointers - * to allocated cpumasks. + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. 
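
partition_cpus_change() reduces to a small decision tree: members are ignored, remote partitions are disabled or resized through the remote_* helpers, and local ones are invalidated or updated through the parent. Sketched as a pure function (illustrative names, not kernel API):

#include <stdbool.h>
#include <stdio.h>

enum action { NONE, REMOTE_DISABLE, REMOTE_UPDATE, LOCAL_INVALIDATE, LOCAL_UPDATE };

static enum action partition_action(bool is_member, bool is_remote, bool prs_err)
{
	if (is_member)
		return NONE;		/* not a partition root: nothing to do */
	if (is_remote)
		return prs_err ? REMOTE_DISABLE : REMOTE_UPDATE;
	return prs_err ? LOCAL_INVALIDATE : LOCAL_UPDATE;
}

int main(void)
{
	/* a remote partition that failed validation gets torn down */
	printf("%d\n", partition_action(false, true, true) == REMOTE_DISABLE);
	return 0;
}
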
*/ - tmp.addmask = trialcs->subparts_cpus; - tmp.delmask = trialcs->effective_cpus; - tmp.new_cpus = trialcs->cpus_allowed; -#endif + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); - if (cs->partition_root_state) { - /* Cpumask of a partition root cannot be empty */ - if (cpumask_empty(trialcs->cpus_allowed)) - return -EINVAL; - if (update_parent_subparts_cpumask(cs, partcmd_update, - trialcs->cpus_allowed, &tmp) < 0) - return -EINVAL; - } + partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + + /* effective_cpus/effective_xcpus will be updated here */ + update_cpumasks_hier(cs, &tmp, force); + + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) + update_partition_sd_lb(cs, old_prs); +out_free: + free_tmpmasks(&tmp); + return retval; +} + +/** + * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + * + * The tasks' cpumask will be updated if cs is a valid partition root. + */ +static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; + + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; + + /* Nothing to do if the CPUs didn't change */ + if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) + return 0; /* - * Make sure that subparts_cpus is a subset of cpus_allowed. + * Reject the change if there is exclusive CPUs conflict with + * the siblings. */ - if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); - cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); - } + if (compute_trialcs_excpus(trialcs, cs)) + return -EINVAL; + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + + retval = validate_change(cs, trialcs); + if (retval) + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + trialcs->prs_err = PERR_NONE; + partition_cpus_change(cs, trialcs, &tmp); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_cpumasks_hier(cs, &tmp); + /* + * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus + * of the subtree when it is a valid partition root or effective_xcpus + * is updated. + */ + if (is_partition_valid(cs) || force) + update_cpumasks_hier(cs, &tmp, force); - if (cs->partition_root_state) { - struct cpuset *parent = parent_cs(cs); + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) + update_partition_sd_lb(cs, old_prs); - /* - * For partition root, update the cpumasks of sibling - * cpusets if they use parent's effective_cpus. 
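
Both update_cpumask() and update_exclusive_cpumask() are reached from userspace through ordinary writes to the cgroupfs files. A minimal sketch, assuming cgroup2 is mounted at /sys/fs/cgroup and a hypothetical child cgroup named "demo" already exists:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a value to a cgroup control file; returns 0 on success. */
static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Reaches update_cpumask() via cpuset_write_resmask(). */
	if (write_file("/sys/fs/cgroup/demo/cpuset.cpus", "1-3"))
		perror("cpuset.cpus");
	/* Reaches update_exclusive_cpumask(); the effective value is
	 * further constrained by cpuset.cpus, as compute_excpus() shows. */
	if (write_file("/sys/fs/cgroup/demo/cpuset.cpus.exclusive", "2-3"))
		perror("cpuset.cpus.exclusive");
	return 0;
}
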
- */ - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); - } + free_tmpmasks(&tmp); return 0; } @@ -1676,9 +2725,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -1712,16 +2776,16 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void *cpuset_being_rebound; /** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. As this function is called with cpuset_rwsem held, + * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs) +void cpuset_update_tasks_nodemask(struct cpuset *cs) { - static nodemask_t newmems; /* protected by cpuset_rwsem */ + static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; struct task_struct *task; @@ -1734,7 +2798,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_rwsem, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1780,7 +2844,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * - * Called with cpuset_rwsem held + * Called with cpuset_mutex held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { @@ -1817,7 +2881,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); - update_tasks_nodemask(cp); + cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -1833,7 +2897,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_rwsem held. May take callback_lock during call. + * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. 
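
schedule_flush_migrate_mm() above replaces the removed cpuset_post_attach() hook: rather than flushing cpuset_migrate_mm_wq synchronously inside the write path, the flush is queued as task work and runs when the current task returns to userspace (TWA_RESUME), after all cgroup locks are dropped. The generic shape of that pattern, in kernel context (not buildable standalone; my_deferred_fn and some_wq are stand-ins):

/* Deferred callback: runs in task context on return to userspace. */
static void my_deferred_fn(struct callback_head *head)
{
	flush_workqueue(some_wq);	/* the slow part, now outside all locks */
	kfree(head);
}

static void defer_flush(void)
{
	struct callback_head *cb = kzalloc(sizeof(*cb), GFP_KERNEL);

	if (!cb)
		return;			/* best effort; losing the flush is tolerated */
	init_task_work(cb, my_deferred_fn);
	if (task_work_add(current, cb, TWA_RESUME))
		kfree(cb);		/* task already exiting: drop the work */
}
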
@@ -1844,41 +2908,24 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, int retval; /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + return retval; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) + return -EINVAL; + + /* No change? nothing to do */ + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) + return 0; - if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } retval = validate_change(cs, trialcs); if (retval < 0) - goto done; + return retval; check_insane_mems_config(&trialcs->mems_allowed); @@ -1888,8 +2935,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); -done: - return retval; + return 0; } bool current_cpuset_is_being_rebound(void) @@ -1903,52 +2949,16 @@ bool current_cpuset_is_being_rebound(void) return ret; } -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_rwsem held, cpuset membership stays - * stable. - */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flag(cs, task); - css_task_iter_end(&it); -} - /* - * update_flag - read a 0 or a 1 in a file and update associated flag + * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_rwsem held. + * Call with cpuset_mutex held. 
 */
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
 	struct cpuset *trialcs;
@@ -1956,7 +2966,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	int spread_flag_changed;
 	int err;
 
-	trialcs = alloc_trial_cpuset(cs);
+	trialcs = dup_or_alloc_cpuset(cs);
 	if (!trialcs)
 		return -ENOMEM;
 
@@ -1979,303 +2989,340 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	cs->flags = trialcs->flags;
 	spin_unlock_irq(&callback_lock);
 
-	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_locked();
+	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+		if (cpuset_v2())
+			cpuset_force_rebuild();
+		else
+			rebuild_sched_domains_locked();
+	}
 
 	if (spread_flag_changed)
-		update_tasks_flags(cs);
+		cpuset1_update_tasks_flags(cs);
 out:
 	free_cpuset(trialcs);
 	return err;
 }
 
-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * new_prs: new partition root state
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
  *
- * Call with cpuset_rwsem held.
+ * Call with cpuset_mutex held.
  */
 static int update_prstate(struct cpuset *cs, int new_prs)
 {
-	int err, old_prs = cs->partition_root_state;
+	int err = PERR_NONE, old_prs = cs->partition_root_state;
 	struct cpuset *parent = parent_cs(cs);
 	struct tmpmasks tmpmask;
+	bool isolcpus_updated = false;
 
 	if (old_prs == new_prs)
 		return 0;
 
 	/*
-	 * Cannot force a partial or invalid partition root to a full
-	 * partition root.
+	 * Treat a previously invalid partition root as if it is a "member".
 	 */
-	if (new_prs && (old_prs == PRS_ERROR))
-		return -EINVAL;
+	if (new_prs && is_partition_invalid(cs))
+		old_prs = PRS_MEMBER;
 
-	if (alloc_cpumasks(NULL, &tmpmask))
+	if (alloc_tmpmasks(&tmpmask))
 		return -ENOMEM;
 
-	err = -EINVAL;
+	err = update_partition_exclusive_flag(cs, new_prs);
+	if (err)
+		goto out;
+
 	if (!old_prs) {
 		/*
-		 * Turning on partition root requires setting the
-		 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
-		 * cannot be NULL.
+		 * cpus_allowed and exclusive_cpus cannot be both empty.
		 */
-		if (cpumask_empty(cs->cpus_allowed))
+		if (xcpus_empty(cs)) {
+			err = PERR_CPUSEMPTY;
 			goto out;
+		}
 
-		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
-		if (err)
+		/*
+		 * We don't support the creation of a new local partition with
+		 * a remote partition underneath it. This unsupported
+		 * setting can happen only if parent is the top_cpuset because
+		 * a remote partition cannot be created underneath an existing
+		 * local or remote partition.
+		 */
+		if ((parent == &top_cpuset) &&
+		    cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
+			err = PERR_REMOTE;
 			goto out;
+		}
 
-		err = update_parent_subparts_cpumask(cs, partcmd_enable,
-						     NULL, &tmpmask);
-		if (err) {
-			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
-			goto out;
+		/*
+		 * If parent is a valid partition root, enable a local
+		 * partition. Otherwise, enable a remote partition.
+		 */
+		if (is_partition_valid(parent)) {
+			enum partition_cmd cmd = (new_prs == PRS_ROOT)
+					       ? partcmd_enable : partcmd_enablei;
+
+			err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
+		} else {
+			err = remote_partition_enable(cs, new_prs, &tmpmask);
 		}
+	} else if (old_prs && new_prs) {
+		/*
+		 * A change in load balance state only, no change in cpumasks.
+		 * Need to update isolated_cpus.
+ */ + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(cs->effective_xcpus, NULL)) || + prstate_housekeeping_conflict(new_prs, cs->effective_xcpus)) + err = PERR_HKEEPING; + else + isolcpus_updated = true; } else { /* - * Turning off partition root will clear the - * CS_CPU_EXCLUSIVE bit. + * Switching back to member is always allowed even if it + * disables child partitions. */ - if (old_prs == PRS_ERROR) { - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - err = 0; - goto out; - } - - err = update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, &tmpmask); - if (err) - goto out; + if (is_remote_partition(cs)) + remote_partition_disable(cs, &tmpmask); + else + update_parent_effective_cpumask(cs, partcmd_disable, + NULL, &tmpmask); - /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + /* + * Invalidation of child partitions will be done in + * update_cpumasks_hier(). + */ } - +out: /* - * Update cpumask of parent's tasks except when it is the top - * cpuset as some system daemons cannot be mapped to other CPUs. + * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error + * happens. */ - if (parent != &top_cpuset) - update_tasks_cpumask(parent); - - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmpmask); - - rebuild_sched_domains_locked(); -out: - if (!err) { - spin_lock_irq(&callback_lock); - cs->partition_root_state = new_prs; - spin_unlock_irq(&callback_lock); - notify_partition_change(cs, old_prs, new_prs); + if (err) { + new_prs = -new_prs; + update_partition_exclusive_flag(cs, new_prs); } - free_cpumasks(NULL, &tmpmask); - return err; -} + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + WRITE_ONCE(cs->prs_err, err); + if (!is_partition_valid(cs)) + reset_partition_data(cs); + else if (isolcpus_updated) + isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. 
- * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; + /* Force update if switching back to member & update effective_xcpus */ + update_cpumasks_hier(cs, &tmpmask, !new_prs); + + /* A newly created partition must have effective_xcpus set */ + WARN_ON_ONCE(!old_prs && (new_prs > 0) + && cpumask_empty(cs->effective_xcpus)); - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; + /* Update sched domains and load balance flag */ + update_partition_sd_lb(cs, old_prs); - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; + notify_partition_change(cs, old_prs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); + free_tmpmasks(&tmpmask); + return 0; } -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) +static struct cpuset *cpuset_attach_old_cs; + +/* + * Check to see if a cpuset can accept a new task + * For v1, cpus_allowed and mems_allowed can't be empty. + * For v2, effective_cpus can't be empty. + * Note that in v1, effective_cpus = cpus_allowed. + */ +static int cpuset_can_attach_check(struct cpuset *cs) { - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); + if (cpumask_empty(cs->effective_cpus) || + (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) + return -ENOSPC; + return 0; } -/* Process any previous ticks, then return current value. 
 */
-static int fmeter_getrate(struct fmeter *fmp)
+static void reset_migrate_dl_data(struct cpuset *cs)
 {
-	int val;
-
-	spin_lock(&fmp->lock);
-	fmeter_update(fmp);
-	val = fmp->val;
-	spin_unlock(&fmp->lock);
-	return val;
+	cs->nr_migrate_dl_tasks = 0;
+	cs->sum_migrate_dl_bw = 0;
 }
 
-static struct cpuset *cpuset_attach_old_cs;
-
-/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
 static int cpuset_can_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
-	struct cpuset *cs;
+	struct cpuset *cs, *oldcs;
 	struct task_struct *task;
+	bool cpus_updated, mems_updated;
 	int ret;
 
 	/* used later by cpuset_attach() */
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+	oldcs = cpuset_attach_old_cs;
 	cs = css_cs(css);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
-	/* allow moving tasks into an empty cpuset if on default hierarchy */
-	ret = -ENOSPC;
-	if (!is_in_v2_mode() &&
-	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+	/* Check to see if task is allowed in the cpuset */
+	ret = cpuset_can_attach_check(cs);
+	if (ret)
 		goto out_unlock;
 
+	cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
 	cgroup_taskset_for_each(task, css, tset) {
-		ret = task_can_attach(task, cs->cpus_allowed);
+		ret = task_can_attach(task);
 		if (ret)
 			goto out_unlock;
-		ret = security_task_setscheduler(task);
-		if (ret)
+
+		/*
+		 * Skip the rights-over-task check in v2 when nothing changes;
+		 * migration permission derives from hierarchy ownership in
+		 * cgroup_procs_write_permission().
+		 */
+		if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+			ret = security_task_setscheduler(task);
+			if (ret)
+				goto out_unlock;
+		}
+
+		if (dl_task(task)) {
+			cs->nr_migrate_dl_tasks++;
+			cs->sum_migrate_dl_bw += task->dl.dl_bw;
+		}
+	}
+
+	if (!cs->nr_migrate_dl_tasks)
+		goto out_success;
+
+	if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+		int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+
+		if (unlikely(cpu >= nr_cpu_ids)) {
+			reset_migrate_dl_data(cs);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+		if (ret) {
+			reset_migrate_dl_data(cs);
 			goto out_unlock;
+		}
 	}
 
+out_success:
 	/*
 	 * Mark attach is in progress. This makes validate_change() fail
 	 * changes which zero cpus/mems_allowed.
 	 */
 	cs->attach_in_progress++;
-	ret = 0;
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	return ret;
 }
 
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
 
 	cgroup_taskset_first(tset, &css);
+	cs = css_cs(css);
+
+	mutex_lock(&cpuset_mutex);
+	dec_attach_in_progress_locked(cs);
 
-	percpu_down_write(&cpuset_rwsem);
-	css_cs(css)->attach_in_progress--;
-	percpu_up_write(&cpuset_rwsem);
+	if (cs->nr_migrate_dl_tasks) {
+		int cpu = cpumask_any(cs->effective_cpus);
+
+		dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+		reset_migrate_dl_data(cs);
+	}
+
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
  * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
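
The fmeter helpers removed from this file (the v1 split moves the memory_pressure machinery out) are fully specified by the comment above: a single-pole IIR low-pass filter over events per second, in fixed point with three implied decimal digits. A self-contained model that reproduces the 10-second half-life, since (933/1000)^10 ~= 0.5; the kernel's per-fmeter spinlock is dropped for brevity:

#include <stdio.h>

#define FM_COEF		933	/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS	99u	/* useless computing more ticks than this */
#define FM_SCALE	1000	/* faux fixed point scale */

struct fmeter { unsigned cnt, val; long time; };

static void fmeter_update(struct fmeter *fmp, long now)
{
	unsigned ticks = now - fmp->time;

	if (!ticks)
		return;
	if (ticks > FM_MAXTICKS)
		ticks = FM_MAXTICKS;
	while (ticks-- > 0)		/* decay one tick per elapsed second */
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;
	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

int main(void)
{
	/* 100 events marked in the first second (each adds FM_SCALE) */
	struct fmeter fm = { .cnt = 100 * FM_SCALE, .val = 0, .time = 0 };

	fmeter_update(&fm, 1);
	printf("t=1s  val=%u\n", fm.val);	/* ~6700 */
	fmeter_update(&fm, 11);			/* 10 idle seconds */
	printf("t=11s val=%u\n", fm.val);	/* ~half of the above */
	return 0;
}
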
*/ static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_to; + +static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) +{ + lockdep_assert_held(&cpuset_mutex); + + if (cs != &top_cpuset) + guarantee_active_cpus(task, cpus_attach); + else + cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), + subpartitions_cpus); + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset1_update_task_spread_flags(cs, task); +} static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_rwsem */ - static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ + mutex_lock(&cpuset_mutex); + cpus_updated = !cpumask_equal(cs->effective_cpus, + oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + /* + * In the default hierarchy, enabling cpuset in the child cgroups + * will trigger a number of cpuset_attach() calls with no change + * in effective cpus and mems. In that case, we can optimize out + * by skipping the task iteration and update. + */ + if (cpuset_v2() && !cpus_updated && !mems_updated) { + cpuset_attach_nodemask_to = cs->effective_mems; + goto out; + } - cgroup_taskset_for_each(task, css, tset) { - if (cs != &top_cpuset) - guarantee_online_cpus(task, cpus_attach); - else - cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); - /* - * can_attach beforehand should guarantee that this doesn't - * fail. TODO: have a better way to handle failure here - */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, task); - } + cgroup_taskset_for_each(task, css, tset) + cpuset_attach_task(cs, task); /* * Change mm for all threadgroup leaders. This is expensive and may - * sleep and should be moved outside migration path proper. + * sleep and should be moved outside migration path proper. Skip it + * if there is no change in effective_mems and CS_MEMORY_MIGRATE is + * not set. */ cpuset_attach_nodemask_to = cs->effective_mems; + if (!is_memory_migrate(cs) && !mems_updated) + goto out; + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); @@ -2290,160 +3337,51 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. 
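
cpuset_attach_task() above calls guarantee_active_cpus() instead of copying a possibly stale online mask. A rough model of the fallback idea, restricting to active CPUs and climbing toward the root while the result is empty, with uint64_t standing in for cpumasks; this approximates, rather than reproduces, the kernel helper:

#include <stdint.h>
#include <stdio.h>

static uint64_t pick_cpus(const uint64_t *effective, int depth, uint64_t active)
{
	for (int d = depth; d >= 0; d--) {
		uint64_t m = effective[d] & active;

		if (m)
			return m;	/* nearest ancestor with active CPUs */
	}
	return active;			/* root always has active CPUs */
}

int main(void)
{
	/* root owns 0-7; child owns 4-5, but 4-5 just went offline */
	uint64_t eff[] = { 0xff, 0x30 };

	printf("%#llx\n", (unsigned long long)pick_cpus(eff, 1, 0x0f));
	return 0;
}
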
*/ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } +out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - - percpu_up_write(&cpuset_rwsem); -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_SUBPARTS_CPULIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_PARTITION_ROOT, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; - - cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; - } - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; + if (cs->nr_migrate_dl_tasks) { + cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; + oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; + reset_migrate_dl_data(cs); } -out_unlock: - percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); - return retval; -} -static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); - if (!is_cpuset_online(cs)) - goto out_unlock; + dec_attach_in_progress_locked(cs); - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); - return retval; + mutex_unlock(&cpuset_mutex); } /* * Common handling for a write to a "cpus" or "mems" file. */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; - buf = strstrip(buf); - - /* - * CPU or memory hotunplug may leave @cs w/o any execution - * resources, in which case the hotplug code asynchronously updates - * configuration and transfers all tasks to the nearest ancestor - * which can execute. 
- * - * As writes to "cpus" or "mems" may restore @cs's execution - * resources, wait for the previously scheduled operations before - * proceeding, so that we don't end up keep removing tasks added - * after execution capability is restored. - * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs - * operation like this one can lead to a deadlock through kernfs - * active_ref protection. Let's break the protection. Losing the - * protection is okay as we check whether @cs is online after - * grabbing cpuset_rwsem anyway. This only happens on the legacy - * hierarchies. - */ - css_get(&cs->css); - kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; - cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + buf = strstrip(buf); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -2453,6 +3391,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; + case FILE_EXCLUSIVE_CPULIST: + retval = update_exclusive_cpumask(cs, trialcs, buf); + break; case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; @@ -2462,12 +3403,12 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); out_unlock: - percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); - kernfs_unbreak_active_protection(of->kn); - css_put(&cs->css); - flush_workqueue(cpuset_migrate_mm_wq); + cpuset_full_unlock(); + if (of_cft(of)->private == FILE_MEMLIST) + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -2479,7 +3420,7 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. 
*/ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) +int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; @@ -2500,8 +3441,17 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; + case FILE_EXCLUSIVE_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); + break; + case FILE_EFFECTIVE_XCPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); + break; case FILE_SUBPARTS_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); + break; + case FILE_ISOLATED_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); break; default: ret = -EINVAL; @@ -2511,71 +3461,38 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static int sched_partition_show(struct seq_file *seq, void *v) +static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); + const char *err, *type = NULL; switch (cs->partition_root_state) { - case PRS_ENABLED: + case PRS_ROOT: seq_puts(seq, "root\n"); break; - case PRS_DISABLED: + case PRS_ISOLATED: + seq_puts(seq, "isolated\n"); + break; + case PRS_MEMBER: seq_puts(seq, "member\n"); break; - case PRS_ERROR: - seq_puts(seq, "root invalid\n"); + case PRS_INVALID_ROOT: + type = "root"; + fallthrough; + case PRS_INVALID_ISOLATED: + if (!type) + type = "isolated"; + err = perr_strings[READ_ONCE(cs->prs_err)]; + if (err) + seq_printf(seq, "%s invalid (%s)\n", type, err); + else + seq_printf(seq, "%s invalid\n", type); break; } return 0; } -static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, +static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -2584,41 +3501,34 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. 
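
With cpuset_partition_show() above, cpuset.cpus.partition reads back as "root", "isolated" or "member", and an invalid partition reports, e.g., "root invalid (<reason>)" using the prs_err string. A minimal reader, assuming a hypothetical cgroup at /sys/fs/cgroup/demo:

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/fs/cgroup/demo/cpuset.cpus.partition", "r");

	if (!f) {
		perror("cpuset.cpus.partition");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("partition state: %s", line); /* e.g. "root invalid (...)" */
	fclose(f);
	return 0;
}
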
- */ if (!strcmp(buf, "root")) - val = PRS_ENABLED; + val = PRS_ROOT; else if (!strcmp(buf, "member")) - val = PRS_DISABLED; + val = PRS_MEMBER; + else if (!strcmp(buf, "isolated")) + val = PRS_ISOLATED; else return -EINVAL; - css_get(&cs->css); - cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); - if (!is_cpuset_online(cs)) - goto out_unlock; - - retval = update_prstate(cs, val); -out_unlock: - percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); - css_put(&cs->css); + cpuset_full_lock(); + if (is_cpuset_online(cs)) + retval = update_prstate(cs, val); + cpuset_full_unlock(); return retval ?: nbytes; } /* - * for the common functions, 'private' gives the type of file + * This is currently a minimal set for the default hierarchy. It can be + * expanded later on by migrating more features and control files from v1. */ - -static struct cftype legacy_files[] = { +static struct cftype dfl_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, }, { @@ -2627,153 +3537,73 @@ static struct cftype legacy_files[] = { .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, + .flags = CFTYPE_NOT_ON_ROOT, }, { - .name = "effective_cpus", + .name = "cpus.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_CPULIST, }, { - .name = "effective_mems", + .name = "mems.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, + .name = "cpus.partition", + .seq_show = cpuset_partition_show, + .write = cpuset_partition_write, + .private = FILE_PARTITION_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct cpuset, partition_file), }, - { } /* terminate */ -}; - -/* - * This is currently a minimal set for the default hierarchy. It can be - * expanded later on by migrating more features and control files from v1. 
- */ -static struct cftype dfl_files[] = { { - .name = "cpus", + .name = "cpus.exclusive", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, + .private = FILE_EXCLUSIVE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { - .name = "mems", + .name = "cpus.exclusive.effective", .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, + .private = FILE_EFFECTIVE_XCPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { - .name = "cpus.effective", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "mems.effective", + .name = "cpus.subpartitions", .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpus.partition", - .seq_show = sched_partition_show, - .write = sched_partition_write, - .private = FILE_PARTITION_ROOT, - .flags = CFTYPE_NOT_ON_ROOT, - .file_offset = offsetof(struct cpuset, partition_file), + .private = FILE_SUBPARTS_CPULIST, + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, { - .name = "cpus.subpartitions", + .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, - .private = FILE_SUBPARTS_CPULIST, - .flags = CFTYPE_DEBUG, + .private = FILE_ISOLATED_CPULIST, + .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ }; -/* - * cpuset_css_alloc - allocate a cpuset css - * cgrp: control group that the new cpuset will be part of +/** + * cpuset_css_alloc - Allocate a cpuset css + * @parent_css: Parent css of the control group that the new cpuset will be + * part of + * Return: cpuset css on success, -ENOMEM on failure. + * + * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return + * top cpuset css otherwise. 
*/ - static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -2782,23 +3612,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - nodes_clear(cs->mems_allowed); - nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; /* Set CS_MEMORY_MIGRATE for default hierarchy */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (cpuset_v2()) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; @@ -2814,14 +3637,16 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); - - set_bit(CS_ONLINE, &cs->flags); + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cpuset_v2() && !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -2829,8 +3654,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; } spin_unlock_irq(&callback_lock); @@ -2840,7 +3663,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for - * histrical reasons - the flag may be specified during mount. + * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to @@ -2866,8 +3689,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); + cpuset_full_unlock(); return 0; } @@ -2877,37 +3699,33 @@ out_unlock: * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. - * - * If the cpuset has the 'sched.partition' flag enabled, simulate - * turning 'sched.partition" off. 
*/ - static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); - - if (is_partition_root(cs)) - update_prstate(cs, 0); - - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } + cpuset_full_lock(); + if (!cpuset_v2() && is_sched_load_balance(cs)) + cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); + cpuset_full_unlock(); +} - percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); +/* + * If a dying cpuset has the 'cpus.partition' enabled, turn it off by + * changing it back to member to free its exclusive CPUs back to the pool to + * be used by other online cpusets. + */ +static void cpuset_css_killed(struct cgroup_subsys_state *css) +{ + struct cpuset *cs = css_cs(css); + + cpuset_full_lock(); + /* Reset valid partition back to member */ + if (is_partition_valid(cs)) + update_prstate(cs, PRS_MEMBER); + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2919,11 +3737,12 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); + cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { cpumask_copy(top_cpuset.cpus_allowed, @@ -2932,7 +3751,65 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } spin_unlock_irq(&callback_lock); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); +} + +/* + * In case the child is cloned into a cpuset different from its parent, + * additional checks are done to see if the move is allowed. + */ +static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + int ret; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return 0; + + lockdep_assert_held(&cgroup_mutex); + mutex_lock(&cpuset_mutex); + + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); + if (ret) + goto out_unlock; + + ret = task_can_attach(task); + if (ret) + goto out_unlock; + + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. 
+ */ + cs->attach_in_progress++; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; +} + +static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return; + + dec_attach_in_progress(cs); } /* @@ -2942,25 +3819,48 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) */ static void cpuset_fork(struct task_struct *task) { - if (task_css_is_root(task, cpuset_cgrp_id)) + struct cpuset *cs; + bool same_cs; + + rcu_read_lock(); + cs = task_cs(task); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) { + if (cs == &top_cpuset) + return; + + set_cpus_allowed_ptr(task, current->cpus_ptr); + task->mems_allowed = current->mems_allowed; return; + } - set_cpus_allowed_ptr(task, current->cpus_ptr); - task->mems_allowed = current->mems_allowed; + /* CLONE_INTO_CGROUP */ + mutex_lock(&cpuset_mutex); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cpuset_attach_task(cs, task); + + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); } struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, + .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .can_fork = cpuset_can_fork, + .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, - .legacy_cftypes = legacy_files, +#ifdef CONFIG_CPUSETS_V1 + .legacy_cftypes = cpuset1_files, +#endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, @@ -2974,90 +3874,32 @@ struct cgroup_subsys cpuset_cgrp_subsys = { int __init cpuset_init(void) { - BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); + cpumask_setall(top_cpuset.effective_xcpus); + cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - return 0; -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). 
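
The can_fork/cancel_fork/fork callbacks wired up here exist for clone3(CLONE_INTO_CGROUP), where a child is created directly inside a target cgroup and the cpuset must vet the placement before the task becomes visible. A minimal userspace sketch (the cgroup path is hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int cgfd = open("/sys/fs/cgroup/demo", O_RDONLY | O_DIRECTORY);
	struct clone_args args = {
		.flags	     = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup	     = (unsigned long)cgfd,
	};
	long pid;

	if (cgfd < 0) {
		perror("open cgroup");
		return 1;
	}
	pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");	/* e.g. ENOSPC from cpuset_can_attach_check() */
		return 1;
	}
	if (pid == 0)
		_exit(0);		/* child starts already charged to "demo" */
	printf("spawned %ld into /sys/fs/cgroup/demo\n", pid);
	return 0;
}
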
- */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); + have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); + if (have_boot_isolcpus) { + BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); + cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); } -} -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migratecd to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - percpu_up_write(&cpuset_rwsem); - - /* - * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. - */ - if (is_empty) - remove_tasks_in_empty_cpuset(cs); - - percpu_down_write(&cpuset_rwsem); + return 0; } static void @@ -3065,7 +3907,8 @@ hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { - if (cpumask_empty(new_cpus)) + /* A partition root is allowed to have empty effective cpus */ + if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; @@ -3076,16 +3919,14 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs); + cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) - update_tasks_nodemask(cs); + cpuset_update_tasks_nodemask(cs); } -static bool force_rebuild; - void cpuset_force_rebuild(void) { - force_rebuild = true; + force_sd_rebuild = true; } /** @@ -3103,18 +3944,20 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) static nodemask_t new_mems; bool cpus_updated; bool mems_updated; + bool remote; + int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); goto retry; } @@ -3122,68 +3965,56 @@ retry: compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); - if (cs->nr_subparts_cpus) - /* - * Make sure that CPUs allocated to child partitions - * do not show up in effective_cpus. 
- */ - cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); - if (!tmp || !cs->partition_root_state) goto update_tasks; /* - * In the unlikely event that a partition root has empty - * effective_cpus or its parent becomes erroneous, we have to - * transition it to the erroneous state. + * Compute effective_cpus for valid partition root, may invalidate + * child partition roots if necessary. */ - if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || - (parent->partition_root_state == PRS_ERROR))) { - if (cs->nr_subparts_cpus) { - spin_lock_irq(&callback_lock); - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - spin_unlock_irq(&callback_lock); - compute_effective_cpumask(&new_cpus, cs, parent); - } - - /* - * If the effective_cpus is empty because the child - * partitions take away all the CPUs, we can keep - * the current partition and let the child partitions - * fight for available CPUs. - */ - if ((parent->partition_root_state == PRS_ERROR) || - cpumask_empty(&new_cpus)) { - int old_prs; - - update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, tmp); - old_prs = cs->partition_root_state; - if (old_prs != PRS_ERROR) { - spin_lock_irq(&callback_lock); - cs->partition_root_state = PRS_ERROR; - spin_unlock_irq(&callback_lock); - notify_partition_change(cs, old_prs, PRS_ERROR); - } - } - cpuset_force_rebuild(); + remote = is_remote_partition(cs); + if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) + compute_partition_effective_cpumask(cs, &new_cpus); + + if (remote && cpumask_empty(&new_cpus) && + partition_is_populated(cs, NULL)) { + cs->prs_err = PERR_HOTPLUG; + remote_partition_disable(cs, tmp); + compute_effective_cpumask(&new_cpus, cs, parent); + remote = false; } /* - * On the other hand, an erroneous partition root may be transitioned - * back to a regular one or a partition root with no CPU allocated - * from the parent may change to erroneous. + * Force the partition to become invalid if either one of + * the following conditions hold: + * 1) empty effective cpus but not valid empty partition. + * 2) parent is invalid or doesn't grant any cpus to child + * partitions. */ - if (is_partition_root(parent) && - ((cs->partition_root_state == PRS_ERROR) || - !cpumask_intersects(&new_cpus, parent->subparts_cpus)) && - update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp)) - cpuset_force_rebuild(); + if (is_local_partition(cs) && (!is_partition_valid(parent) || + tasks_nocpu_error(parent, cs, &new_cpus))) + partcmd = partcmd_invalidate; + /* + * On the other hand, an invalid partition root may be transitioned + * back to a regular one with a non-empty effective xcpus. 
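These invalidation and recovery rules can be observed from userspace: offlining the last CPU of a populated local partition forces it invalid, and bringing the CPU back lets partcmd_update revalidate it. A hedged sketch of that round trip, assuming CONFIG_HOTPLUG_CPU and a child group /sys/fs/cgroup/part whose cpuset.cpus is just CPU 3 (paths and the exact reason string are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void put_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd >= 0) {
		write(fd, val, strlen(val));
		close(fd);
	}
}

static void show(const char *path)
{
	char buf[128] = "";
	int fd = open(path, O_RDONLY);

	if (fd >= 0) {
		read(fd, buf, sizeof(buf) - 1);
		close(fd);
	}
	printf("%s", buf);
}

int main(void)
{
	const char *prs = "/sys/fs/cgroup/part/cpuset.cpus.partition";

	put_str("/sys/fs/cgroup/part/cpuset.cpus", "3");
	put_str(prs, "root");
	show(prs);                                      /* "root" */
	put_str("/sys/devices/system/cpu/cpu3/online", "0");
	show(prs);      /* "root invalid (...)" via the hotplug path above */
	put_str("/sys/devices/system/cpu/cpu3/online", "1");
	show(prs);              /* valid again through partcmd_update */
	return 0;
}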
+ */ + else if (is_partition_valid(parent) && is_partition_invalid(cs) && + !cpumask_empty(cs->effective_xcpus)) + partcmd = partcmd_update; + + if (partcmd >= 0) { + update_parent_effective_cpumask(cs, partcmd, NULL, tmp); + if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { + compute_partition_effective_cpumask(cs, &new_cpus); + cpuset_force_rebuild(); + } + } update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); + if (!cpus_updated && !mems_updated) + goto unlock; /* Hotplug doesn't affect this cpuset */ if (mems_updated) check_insane_mems_config(&new_mems); @@ -3192,14 +4023,15 @@ update_tasks: hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else - hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); - percpu_up_write(&cpuset_rwsem); +unlock: + mutex_unlock(&cpuset_mutex); } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -3213,8 +4045,10 @@ update_tasks: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -3222,49 +4056,43 @@ static void cpuset_hotplug_workfn(struct work_struct *work) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; - percpu_down_write(&cpuset_rwsem); + lockdep_assert_cpus_held(); + mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; /* - * If subparts_cpus is populated, it is likely that the check below - * will produce a false positive on cpus_updated when the cpu list - * isn't changed. It is extra work, but it is better to be safe. + * If subpartitions_cpus is populated, it is likely that the check + * below will produce a false positive on cpus_updated when the cpu + * list isn't changed. It is extra work, but it is better to be safe. */ - cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || + !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); - /* - * In the rare case that hotplug removes all the cpus in subparts_cpus, - * we assumed that cpus are updated. - */ - if (!cpus_updated && top_cpuset.nr_subparts_cpus) - cpus_updated = true; - - /* synchronize cpus_allowed to cpu_active_mask */ + /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { + cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); /* * Make sure that CPUs allocated to child partitions * do not show up in effective_cpus. If no CPU is left, - * we clear the subparts_cpus & let the child partitions + * we clear the subpartitions_cpus & let the child partitions * fight for the CPUs again. 
 */
- if (top_cpuset.nr_subparts_cpus) {
- if (cpumask_subset(&new_cpus,
- top_cpuset.subparts_cpus)) {
- top_cpuset.nr_subparts_cpus = 0;
- cpumask_clear(top_cpuset.subparts_cpus);
+ if (!cpumask_empty(subpartitions_cpus)) {
+ if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
+ cpumask_clear(subpartitions_cpus);
 } else {
 cpumask_andnot(&new_cpus, &new_cpus,
- top_cpuset.subparts_cpus);
+ subpartitions_cpus);
 }
 }
 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
@@ -3279,10 +4107,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 top_cpuset.mems_allowed = new_mems;
 top_cpuset.effective_mems = new_mems;
 spin_unlock_irq(&callback_lock);
- update_tasks_nodemask(&top_cpuset);
+ cpuset_update_tasks_nodemask(&top_cpuset);
 }
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
 /* if cpus or mems changed, we need to propagate to descendants */
 if (cpus_updated || mems_updated) {
@@ -3303,13 +4131,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 rcu_read_unlock();
 }
- /* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated || force_rebuild) {
- force_rebuild = false;
- rebuild_sched_domains();
- }
+ /* rebuild sched domains if necessary */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_cpuslocked();
- free_cpumasks(NULL, ptmp);
+ free_tmpmasks(ptmp);
 }
 void cpuset_update_active_cpus(void)
@@ -3319,12 +4145,7 @@ void cpuset_update_active_cpus(void)
 * inside cgroup synchronization. Bounce actual hotplug processing
 * to a work item to avoid reverse locking order.
 */
- schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
- flush_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
 }
 /*
@@ -3335,15 +4156,10 @@ void cpuset_wait_for_hotplug(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 unsigned long action, void *arg)
 {
- schedule_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
 return NOTIFY_OK;
 }
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
 /**
 * cpuset_init_smp - initialize cpus_allowed
 *
@@ -3351,28 +4167,73 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
 */
 void __init cpuset_init_smp(void)
 {
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
+ /*
+ * cpus_allowed/mems_allowed set to v2 values in the initial
+ * cpuset_bind() call will be reset to v1 values in another
+ * cpuset_bind() call when v1 cpuset is mounted.
+ */
 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
 top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
 BUG_ON(!cpuset_migrate_mm_wq);
 }
+/*
+ * Return cpus_allowed mask from a task's cpuset.
+ */
+static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+ struct cpuset *cs;
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_active_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get updates to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
+ if (!cpumask_intersects(pmask, cpu_active_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+}
+
+/**
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Similar to cpuset_cpus_allowed() except that the caller must have acquired
+ * cpuset_mutex.
+ */
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+ lockdep_assert_held(&cpuset_mutex);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
+}
+
 /**
- * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * subset of cpu_active_mask, even if this means going outside the
+ * task's cpuset, except when the task is in the top cpuset.
 **/
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
@@ -3380,7 +4241,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 unsigned long flags;
 spin_lock_irqsave(&callback_lock, flags);
- guarantee_online_cpus(tsk, pmask);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
 spin_unlock_irqrestore(&callback_lock, flags);
 }
@@ -3407,7 +4268,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 rcu_read_lock();
 cs_mask = task_cs(tsk)->cpus_allowed;
 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
- do_set_cpus_allowed(tsk, cs_mask);
+ set_cpus_allowed_force(tsk, cs_mask);
 changed = true;
 }
 rcu_read_unlock();
@@ -3453,9 +4314,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 unsigned long flags;
 spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
 guarantee_online_mems(task_cs(tsk), &mask);
- rcu_read_unlock();
 spin_unlock_irqrestore(&callback_lock, flags);
 return mask;
@@ -3485,8 +4344,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 return cs;
 }
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
@@ -3525,7 +4384,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
 * GFP_USER - only nodes in current tasks mems allowed ok.
 */
-bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
{
 struct cpuset *cs; /* current cpuset ancestors */
 bool allowed; /* is allocation in zone z allowed?
 */
@@ -3550,18 +4409,52 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 /* Not hardwall and node outside mems_allowed: scan up cpusets */
 spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
 cs = nearest_hardwall_ancestor(task_cs(current));
 allowed = node_isset(node, cs->mems_allowed);
- rcu_read_unlock();
 spin_unlock_irqrestore(&callback_lock, flags);
 return allowed;
 }
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+ bool allowed;
+
+ /*
+ * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
+ * and mems_allowed is likely to be empty even if we could get to it,
+ * so return true to avoid taking a global lock on the empty check.
+ */
+ if (!cpuset_v2())
+ return true;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return true;
+
+ /*
+ * Normally, accessing effective_mems would require the cpuset_mutex
+ * or callback_lock - but node_isset is atomic and the reference
+ * taken via cgroup_get_e_css is sufficient to protect css.
+ *
+ * Since this interface is intended for use by migration paths, we
+ * relax locking here to avoid taking global locks - while accepting
+ * there may be rare scenarios where the result may be inaccurate.
+ *
+ * Reclaim and migration are subject to these same race conditions, and
+ * cannot make strong isolation guarantees, so this is acceptable.
+ */
+ cs = container_of(css, struct cpuset, css);
+ allowed = node_isset(nid, cs->effective_mems);
+ css_put(css);
+ return allowed;
+}
+
 /**
- * cpuset_mem_spread_node() - On which node to begin search for a file page
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ * cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -3585,12 +4478,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
-
 static int cpuset_spread_node(int *rotor)
 {
 return *rotor = next_node_in(*rotor, current->mems_allowed);
 }
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ */
 int cpuset_mem_spread_node(void)
 {
 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
@@ -3600,17 +4495,6 @@ int cpuset_mem_spread_node(void)
 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
 }
-int cpuset_slab_spread_node(void)
-{
- if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
- current->cpuset_slab_spread_rotor =
- node_random(&current->mems_allowed);
-
- return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
-}
-
-EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
-
 /**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
@@ -3649,79 +4533,6 @@ void cpuset_print_current_mems_allowed(void)
 rcu_read_unlock();
 }
-/*
- * Collection of memory_pressure is suppressed unless
- * this flag is enabled by writing "1" to the special
- * cpuset file 'memory_pressure_enabled' in the root cpuset.
- */
-
-int cpuset_memory_pressure_enabled __read_mostly;
-
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
- *
- * Keep a running average of the rate of synchronous (direct)
- * page reclaim efforts initiated by tasks in each cpuset.
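cpuset_node_allowed() above is deliberately lock-light so that reclaim and migration paths can filter candidate nodes per cgroup without global locks. A hedged kernel-side sketch of such a caller; pick_demotion_node() is hypothetical and not part of this patch:

#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/numa.h>

/* Hypothetical: first demotion target that @cgroup's effective_mems allows. */
static int pick_demotion_node(struct cgroup *cgroup, const nodemask_t *targets)
{
	int nid;

	for_each_node_mask(nid, *targets) {
		/* Answer may be momentarily stale; fine for migration, see above. */
		if (cpuset_node_allowed(cgroup, nid))
			return nid;
	}
	return NUMA_NO_NODE;
}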
- * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - **/ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_rwsem, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); - if (retval >= PATH_MAX) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 80aa3f027ac3..81ea38dd6f9d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) return -ENODEV; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); @@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, css, css->id); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); cgroup_kn_unlock(of->kn); return 0; @@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) return -ENOMEM; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name_buf); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c new file mode 100644 index 000000000000..e12b946278b6 --- /dev/null +++ b/kernel/cgroup/dmem.c @@ -0,0 +1,830 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>) + * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>) + * Partially based on the rdma and misc controllers, which bear the following copyrights: + * + * Copyright 2020 Google LLC + * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> + */ + 
+#include <linux/cgroup.h>
+#include <linux/cgroup_dmem.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/page_counter.h>
+#include <linux/parser.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+
+struct dmem_cgroup_region {
+ /**
+ * @ref: References keeping the region alive.
+ * Keeps the region reference alive after a successful RCU lookup.
+ */
+ struct kref ref;
+
+ /** @rcu: RCU head for freeing */
+ struct rcu_head rcu;
+
+ /**
+ * @region_node: Linked into &dmem_cgroup_regions list.
+ * Protected by RCU and global spinlock.
+ */
+ struct list_head region_node;
+
+ /**
+ * @pools: List of pools linked to this region.
+ * Protected by global spinlock only
+ */
+ struct list_head pools;
+
+ /** @size: Size of region, in bytes */
+ u64 size;
+
+ /** @name: Name describing the node, set by dmem_cgroup_register_region */
+ char *name;
+
+ /**
+ * @unregistered: Whether the region is unregistered by its caller.
+ * No new pools should be added to the region afterwards.
+ */
+ bool unregistered;
+};
+
+struct dmemcg_state {
+ struct cgroup_subsys_state css;
+
+ struct list_head pools;
+};
+
+struct dmem_cgroup_pool_state {
+ struct dmem_cgroup_region *region;
+ struct dmemcg_state *cs;
+
+ /* css node, RCU protected against region teardown */
+ struct list_head css_node;
+
+ /* dev node, no RCU protection required */
+ struct list_head region_node;
+
+ struct rcu_head rcu;
+
+ struct page_counter cnt;
+
+ bool inited;
+};
+
+/*
+ * 3 operations require locking protection:
+ * - Registering and unregistering region to/from list, requires global lock.
+ * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
+ * - Adding a dmem_cgroup_pool_state to a region list.
+ *
+ * Since for the most common operations RCU provides enough protection, I
+ * do not think more granular locking makes sense. Most protection is offered
+ * by RCU and the lockless operating page_counter.
+ */
+static DEFINE_SPINLOCK(dmemcg_lock);
+static LIST_HEAD(dmem_cgroup_regions);
+
+static inline struct dmemcg_state *
+css_to_dmemcs(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct dmemcg_state, css);
+}
+
+static inline struct dmemcg_state *get_current_dmemcs(void)
+{
+ return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
+}
+
+static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
+{
+ return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
+}
+
+static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
+{
+ list_del(&pool->region_node);
+ kfree(pool);
+}
+
+static void
+set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_min(&pool->cnt, val);
+}
+
+static void
+set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_low(&pool->cnt, val);
+}
+
+static void
+set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_max(&pool->cnt, val);
+}
+
+static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.low) : 0;
+}
+
+static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.min) : 0;
+}
+
+static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
+}
+
+static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ?
page_counter_read(&pool->cnt) : 0;
+}
+
+static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
+{
+ set_resource_min(rpool, 0);
+ set_resource_low(rpool, 0);
+ set_resource_max(rpool, PAGE_COUNTER_MAX);
+}
+
+static void dmemcs_offline(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
+ reset_all_resource_limits(pool);
+ rcu_read_unlock();
+}
+
+static void dmemcs_free(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ spin_lock(&dmemcg_lock);
+ list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
+ /*
+ * The pool is dead and all references are 0,
+ * no need for RCU protection with list_del_rcu or freeing.
+ */
+ list_del(&pool->css_node);
+ free_cg_pool(pool);
+ }
+ spin_unlock(&dmemcg_lock);
+
+ kfree(dmemcs);
+}
+
+static struct cgroup_subsys_state *
+dmemcs_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
+ if (!dmemcs)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dmemcs->pools);
+ return &dmemcs->css;
+}
+
+static struct dmem_cgroup_pool_state *
+find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool;
+
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
+ if (pool->region == region)
+ return pool;
+
+ return NULL;
+}
+
+static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
+{
+ if (!pool->cnt.parent)
+ return NULL;
+
+ return container_of(pool->cnt.parent, typeof(*pool), cnt);
+}
+
+static void
+dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool)
+{
+ struct page_counter *climit;
+ struct cgroup_subsys_state *css;
+ struct dmemcg_state *dmemcg_iter;
+ struct dmem_cgroup_pool_state *pool, *found_pool;
+
+ climit = &limit_pool->cnt;
+
+ rcu_read_lock();
+
+ css_for_each_descendant_pre(css, &limit_pool->cs->css) {
+ dmemcg_iter = container_of(css, struct dmemcg_state, css);
+ found_pool = NULL;
+
+ list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
+ if (pool->region == limit_pool->region) {
+ found_pool = pool;
+ break;
+ }
+ }
+ if (!found_pool)
+ continue;
+
+ page_counter_calculate_protection(
+ climit, &found_pool->cnt, true);
+
+ if (found_pool == test_pool)
+ break;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
+ * @limit_pool: The pool for which we hit limits
+ * @test_pool: The pool for which to test
+ * @ignore_low: Whether we have to respect low watermarks.
+ * @ret_hit_low: Pointer to whether it makes sense to consider low watermark.
+ *
+ * This function returns true if we can evict from @test_pool, false if not.
+ * When returning false and @ignore_low is false, @ret_hit_low may
+ * be set to true to indicate this function can be retried with @ignore_low
+ * set to true.
+ *
+ * Return: bool
+ */
+bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool,
+ bool ignore_low, bool *ret_hit_low)
+{
+ struct dmem_cgroup_pool_state *pool = test_pool;
+ struct page_counter *ctest;
+ u64 used, min, low;
+
+ /* Can always evict from current pool, despite limits */
+ if (limit_pool == test_pool)
+ return true;
+
+ if (limit_pool) {
+ if (!parent_dmemcs(limit_pool->cs))
+ return true;
+
+ for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
+ {}
+
+ if (!pool)
+ return false;
+ } else {
+ /*
+ * If there is no cgroup limiting memory usage, use the root
+ * cgroup instead for limit calculations.
+ */
+ for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
+ {}
+ }
+
+ ctest = &test_pool->cnt;
+
+ dmem_cgroup_calculate_protection(limit_pool, test_pool);
+
+ used = page_counter_read(ctest);
+ min = READ_ONCE(ctest->emin);
+
+ if (used <= min)
+ return false;
+
+ if (!ignore_low) {
+ low = READ_ONCE(ctest->elow);
+ if (used > low)
+ return true;
+
+ *ret_hit_low = true;
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
+
+static struct dmem_cgroup_pool_state *
+alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmemcg_state *parent = parent_dmemcs(dmemcs);
+ struct dmem_cgroup_pool_state *pool, *ppool = NULL;
+
+ if (!*allocpool) {
+ pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+ } else {
+ pool = *allocpool;
+ *allocpool = NULL;
+ }
+
+ pool->region = region;
+ pool->cs = dmemcs;
+
+ if (parent)
+ ppool = find_cg_pool_locked(parent, region);
+
+ page_counter_init(&pool->cnt,
+ ppool ? &ppool->cnt : NULL, true);
+ reset_all_resource_limits(pool);
+
+ list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
+ list_add_tail(&pool->region_node, &region->pools);
+
+ if (!parent)
+ pool->inited = true;
+ else
+ pool->inited = ppool ? ppool->inited : false;
+ return pool;
+}
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
+ struct dmemcg_state *p, *pp;
+
+ /*
+ * Recursively create pool, we may not initialize yet on
+ * recursion, this is done as a separate step.
+ */
+ for (p = dmemcs; p; p = parent_dmemcs(p)) {
+ pool = find_cg_pool_locked(p, region);
+ if (!pool)
+ pool = alloc_pool_single(p, region, allocpool);
+
+ if (IS_ERR(pool))
+ return pool;
+
+ if (p == dmemcs && pool->inited)
+ return pool;
+
+ if (pool->inited)
+ break;
+ }
+
+ retpool = pool = find_cg_pool_locked(dmemcs, region);
+ for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
+ if (pool->inited)
+ break;
+
+ /* ppool was created if it didn't exist by above loop. */
+ ppool = find_cg_pool_locked(pp, region);
+
+ /* Fix up parent links, mark as inited.
 */
+ pool->cnt.parent = &ppool->cnt;
+ pool->inited = true;
+
+ pool = ppool;
+ }
+
+ return retpool;
+}
+
+static void dmemcg_free_rcu(struct rcu_head *rcu)
+{
+ struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ list_for_each_entry_safe(pool, next, &region->pools, region_node)
+ free_cg_pool(pool);
+ kfree(region->name);
+ kfree(region);
+}
+
+static void dmemcg_free_region(struct kref *ref)
+{
+ struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);
+
+ call_rcu(&cgregion->rcu, dmemcg_free_rcu);
+}
+
+/**
+ * dmem_cgroup_unregister_region() - Unregister a previously registered region.
+ * @region: The region to unregister.
+ *
+ * This function undoes dmem_cgroup_register_region.
+ */
+void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
+{
+ struct list_head *entry;
+
+ if (!region)
+ return;
+
+ spin_lock(&dmemcg_lock);
+
+ /* Remove from global region list */
+ list_del_rcu(&region->region_node);
+
+ list_for_each_rcu(entry, &region->pools) {
+ struct dmem_cgroup_pool_state *pool =
+ container_of(entry, typeof(*pool), region_node);
+
+ list_del_rcu(&pool->css_node);
+ }
+
+ /*
+ * Ensure any RCU based lookups fail. Additionally,
+ * no new pools should be added to the dead region
+ * by get_cg_pool_unlocked.
+ */
+ region->unregistered = true;
+ spin_unlock(&dmemcg_lock);
+
+ kref_put(&region->ref, dmemcg_free_region);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
+
+/**
+ * dmem_cgroup_register_region() - Register a region for the dmem cgroup.
+ * @size: Size of region to register, in bytes.
+ * @fmt: Region parameters to register
+ *
+ * This function registers a node in the dmem cgroup with the
+ * name given. After calling this function, the region can be
+ * used for allocations.
+ *
+ * Return: NULL or a struct on success, PTR_ERR on failure.
+ */
+struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
+{
+ struct dmem_cgroup_region *ret;
+ char *region_name;
+ va_list ap;
+
+ if (!size)
+ return NULL;
+
+ va_start(ap, fmt);
+ region_name = kvasprintf(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!region_name)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret) {
+ kfree(region_name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&ret->pools);
+ ret->name = region_name;
+ ret->size = size;
+ kref_init(&ret->ref);
+
+ spin_lock(&dmemcg_lock);
+ list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
+ spin_unlock(&dmemcg_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
+
+static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
+{
+ struct dmem_cgroup_region *region;
+
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
+ if (!strcmp(name, region->name) &&
+ kref_get_unless_zero(&region->ref))
+ return region;
+
+ return NULL;
+}
+
+/**
+ * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
+ * @pool: &dmem_cgroup_pool_state
+ *
+ * Called to drop a reference to the limiting pool returned by
+ * dmem_cgroup_try_charge().
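For context, a driver registers one region per device memory pool, typically at probe time, and unregisters it on removal. A minimal sketch matching the signatures above; the device name, size, and the drm/%s/%s naming are placeholders modeled on existing callers:

#include <linux/cgroup_dmem.h>
#include <linux/err.h>
#include <linux/types.h>

static struct dmem_cgroup_region *vram_region;

static int fake_probe(const char *devname, u64 vram_size)
{
	vram_region = dmem_cgroup_register_region(vram_size,
						  "drm/%s/vram0", devname);
	if (IS_ERR(vram_region))
		return PTR_ERR(vram_region);
	/* NULL is also possible when vram_size == 0: nothing to account. */
	return 0;
}

static void fake_remove(void)
{
	dmem_cgroup_unregister_region(vram_region);	/* NULL-safe */
	vram_region = NULL;
}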
+ */
+void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
+{
+ if (pool)
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool, *allocpool = NULL;
+
+ /* fastpath lookup? */
+ rcu_read_lock();
+ pool = find_cg_pool_locked(cg, region);
+ if (pool && !READ_ONCE(pool->inited))
+ pool = NULL;
+ rcu_read_unlock();
+
+ while (!pool) {
+ spin_lock(&dmemcg_lock);
+ if (!region->unregistered)
+ pool = get_cg_pool_locked(cg, region, &allocpool);
+ else
+ pool = ERR_PTR(-ENODEV);
+ spin_unlock(&dmemcg_lock);
+
+ if (pool == ERR_PTR(-ENOMEM)) {
+ pool = NULL;
+ if (WARN_ON(allocpool))
+ continue;
+
+ allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
+ if (allocpool) {
+ pool = NULL;
+ continue;
+ }
+ }
+ }
+
+ kfree(allocpool);
+ return pool;
+}
+
+/**
+ * dmem_cgroup_uncharge() - Uncharge a pool.
+ * @pool: Pool to uncharge.
+ * @size: Size to uncharge.
+ *
+ * Undoes the effects of dmem_cgroup_try_charge.
+ * Must be called with the returned pool as argument,
+ * and the same @size.
+ */
+void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
+{
+ if (!pool)
+ return;
+
+ page_counter_uncharge(&pool->cnt, size);
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
+
+/**
+ * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
+ * @region: dmem region to charge
+ * @size: Size (in bytes) to charge.
+ * @ret_pool: On successful allocation, the pool that is charged.
+ * @ret_limit_pool: On a failed allocation, the limiting pool.
+ *
+ * This function charges the @region region for a size of @size bytes.
+ *
+ * If the function succeeds, @ret_pool is set, which must be passed to
+ * dmem_cgroup_uncharge() when undoing the allocation.
+ *
+ * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
+ * will be set to the pool for which the limit is hit. This can be used for
+ * eviction as argument to dmem_cgroup_state_evict_valuable(). This reference
+ * must be freed with @dmem_cgroup_pool_state_put().
+ *
+ * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
+ */
+int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
+ struct dmem_cgroup_pool_state **ret_pool,
+ struct dmem_cgroup_pool_state **ret_limit_pool)
+{
+ struct dmemcg_state *cg;
+ struct dmem_cgroup_pool_state *pool;
+ struct page_counter *fail;
+ int ret;
+
+ *ret_pool = NULL;
+ if (ret_limit_pool)
+ *ret_limit_pool = NULL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
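Taken together with dmem_cgroup_state_evict_valuable(), the charge path documented above supports the usual two-pass eviction loop: first try to stay above low protection, and only ignore it when nothing else was evictable. A hedged consumer sketch; evict_one() stands in for a hypothetical driver callback built on dmem_cgroup_state_evict_valuable():

#include <linux/cgroup_dmem.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Hypothetical driver hook: evict one buffer charged under @limit_pool. */
bool evict_one(struct dmem_cgroup_region *region,
	       struct dmem_cgroup_pool_state *limit_pool, bool ignore_low);

static int fake_alloc(struct dmem_cgroup_region *region, u64 size)
{
	struct dmem_cgroup_pool_state *pool, *limit_pool;
	int ret;

	while ((ret = dmem_cgroup_try_charge(region, size,
					     &pool, &limit_pool)) == -EAGAIN) {
		/* Pass 1 respects low watermarks, pass 2 ignores them. */
		bool evicted = evict_one(region, limit_pool, false) ||
			       evict_one(region, limit_pool, true);

		dmem_cgroup_pool_state_put(limit_pool);
		if (!evicted)
			return -ENOSPC;
	}
	if (ret)
		return ret;

	/* ... allocate backing store; on a later failure, undo with: */
	/* dmem_cgroup_uncharge(pool, size); */
	return 0;
}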
+ */
+ cg = get_current_dmemcs();
+
+ pool = get_cg_pool_unlocked(cg, region);
+ if (IS_ERR(pool)) {
+ ret = PTR_ERR(pool);
+ goto err;
+ }
+
+ if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
+ if (ret_limit_pool) {
+ *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
+ css_get(&(*ret_limit_pool)->cs->css);
+ }
+ ret = -EAGAIN;
+ goto err;
+ }
+
+ /* On success, reference from get_current_dmemcs is transferred to *ret_pool */
+ *ret_pool = pool;
+ return 0;
+
+err:
+ css_put(&cg->css);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
+
+static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
+{
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ seq_puts(sf, region->name);
+ seq_printf(sf, " %llu\n", region->size);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
+ u64 *new_limit)
+{
+ char *end;
+
+ if (!strcmp(options, "max")) {
+ *new_limit = PAGE_COUNTER_MAX;
+ return 0;
+ }
+
+ *new_limit = memparse(options, &end);
+ if (*end != '\0')
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ void (*apply)(struct dmem_cgroup_pool_state *, u64))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
+ int err = 0;
+
+ while (buf && !err) {
+ struct dmem_cgroup_pool_state *pool = NULL;
+ char *options, *region_name;
+ struct dmem_cgroup_region *region;
+ u64 new_limit;
+
+ options = buf;
+ buf = strchr(buf, '\n');
+ if (buf)
+ *buf++ = '\0';
+
+ options = strstrip(options);
+
+ /* eat empty lines */
+ if (!options[0])
+ continue;
+
+ region_name = strsep(&options, " \t");
+ if (!region_name[0])
+ continue;
+
+ rcu_read_lock();
+ region = dmemcg_get_region_by_name(region_name);
+ rcu_read_unlock();
+
+ if (!region)
+ return -EINVAL;
+
+ err = dmemcg_parse_limit(options, region, &new_limit);
+ if (err < 0)
+ goto out_put;
+
+ pool = get_cg_pool_unlocked(dmemcs, region);
+ if (IS_ERR(pool)) {
+ err = PTR_ERR(pool);
+ goto out_put;
+ }
+
+ /* And commit */
+ apply(pool, new_limit);
+
+out_put:
+ kref_put(&region->ref, dmemcg_free_region);
+ }
+
+
+ return err ?: nbytes;
+}
+
+static int dmemcg_limit_show(struct seq_file *sf, void *v,
+ u64 (*fn)(struct dmem_cgroup_pool_state *))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
+ u64 val;
+
+ seq_puts(sf, region->name);
+
+ val = fn(pool);
+ if (val < PAGE_COUNTER_MAX)
+ seq_printf(sf, " %lld\n", val);
+ else
+ seq_puts(sf, " max\n");
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_current);
+}
+
+static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_min);
+}
+
+static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
+}
+
+static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_low);
+}
+
+static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file
*of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low); +} + +static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_max); +} + +static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max); +} + +static struct cftype files[] = { + { + .name = "capacity", + .seq_show = dmem_cgroup_region_capacity_show, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + { + .name = "current", + .seq_show = dmem_cgroup_region_current_show, + }, + { + .name = "min", + .write = dmem_cgroup_region_min_write, + .seq_show = dmem_cgroup_region_min_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "low", + .write = dmem_cgroup_region_low_write, + .seq_show = dmem_cgroup_region_low_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "max", + .write = dmem_cgroup_region_max_write, + .seq_show = dmem_cgroup_region_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* Zero entry terminates. */ +}; + +struct cgroup_subsys dmem_cgrp_subsys = { + .css_alloc = dmemcs_alloc, + .css_free = dmemcs_free, + .css_offline = dmemcs_offline, + .legacy_cftypes = files, + .dfl_cftypes = files, +}; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 3984dd6b8ddb..6c18854bff34 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -9,6 +9,28 @@ #include <trace/events/cgroup.h> /* + * Update CGRP_FROZEN of cgroup.flag + * Return true if flags is updated; false if flags has no change + */ +static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen) +{ + lockdep_assert_held(&css_set_lock); + + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen) + return false; + + if (frozen) + set_bit(CGRP_FROZEN, &cgrp->flags); + else + clear_bit(CGRP_FROZEN, &cgrp->flags); + + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); + return true; +} + +/* * Propagate the cgroup frozen state upwards by the cgroup tree. 
*/ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) @@ -24,24 +46,16 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) while ((cgrp = cgroup_parent(cgrp))) { if (frozen) { cgrp->freezer.nr_frozen_descendants += desc; - if (!test_bit(CGRP_FROZEN, &cgrp->flags) && - test_bit(CGRP_FREEZE, &cgrp->flags) && - cgrp->freezer.nr_frozen_descendants == - cgrp->nr_descendants) { - set_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); - desc++; - } + if (!test_bit(CGRP_FREEZE, &cgrp->flags) || + (cgrp->freezer.nr_frozen_descendants != + cgrp->nr_descendants)) + continue; } else { cgrp->freezer.nr_frozen_descendants -= desc; - if (test_bit(CGRP_FROZEN, &cgrp->flags)) { - clear_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); - desc++; - } } + + if (cgroup_update_frozen_flag(cgrp, frozen)) + desc++; } } @@ -53,8 +67,6 @@ void cgroup_update_frozen(struct cgroup *cgrp) { bool frozen; - lockdep_assert_held(&css_set_lock); - /* * If the cgroup has to be frozen (CGRP_FREEZE bit set), * and all tasks are frozen and/or stopped, let's consider @@ -63,24 +75,9 @@ void cgroup_update_frozen(struct cgroup *cgrp) frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - if (frozen) { - /* Already there? */ - if (test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - set_bit(CGRP_FROZEN, &cgrp->flags); - } else { - /* Already there? */ - if (!test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - clear_bit(CGRP_FROZEN, &cgrp->flags); - } - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); - - /* Update the state of ancestor cgroups. */ - cgroup_propagate_frozen(cgrp, frozen); + /* If flags is updated, update the state of ancestor cgroups. */ + if (cgroup_update_frozen_flag(cgrp, frozen)) + cgroup_propagate_frozen(cgrp, frozen); } /* @@ -174,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -182,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,8 +263,11 @@ void cgroup_freezer_migrate_task(struct task_struct *task, void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; + struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; + bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -272,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. 
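What the flag juggling above means for userspace: writing to cgroup.freeze sets CGRP_FREEZE at once, but "frozen 1" only shows up in cgroup.events when every task has stopped and CGRP_FROZEN propagates. A small sketch of that handshake; the path is illustrative and the wakeup relies on kernfs notification (POLLPRI):

#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int evfd = open("/sys/fs/cgroup/mygrp/cgroup.events", O_RDONLY);
	int frfd = open("/sys/fs/cgroup/mygrp/cgroup.freeze", O_WRONLY);
	char buf[256];
	ssize_t n;

	if (evfd < 0 || frfd < 0)
		return 1;
	write(frfd, "1", 1);            /* sets CGRP_FREEZE, freeze starts */
	for (;;) {
		struct pollfd pfd = { .fd = evfd, .events = POLLPRI };

		lseek(evfd, 0, SEEK_SET);
		n = read(evfd, buf, sizeof(buf) - 1);
		buf[n > 0 ? n : 0] = '\0';
		if (strstr(buf, "frozen 1"))    /* CGRP_FROZEN reached */
			break;
		poll(&pfd, 1, -1);      /* kernfs signals the flip via POLLPRI */
	}
	return 0;
}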
@@ -282,28 +289,24 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) if (cgroup_is_dead(dsct)) continue; - if (freeze) { - dsct->freezer.e_freeze++; - /* - * Already frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 1) - continue; - } else { - dsct->freezer.e_freeze--; - /* - * Still frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 0) - continue; - - WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + /* + * e_freeze is affected by parent's e_freeze and dst's freeze. + * If old e_freeze eq new e_freeze, no change, its children + * will not be affected. So do nothing and skip the subtree + */ + old_e = dsct->freezer.e_freeze; + parent = cgroup_parent(dsct); + dsct->freezer.e_freeze = (dsct->freezer.freeze || + parent->freezer.e_freeze); + if (dsct->freezer.e_freeze == old_e) { + css = css_rightmost_descendant(css); + continue; } /* * Do change actual state: freeze or unfreeze. */ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 08236798d173..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -22,6 +22,7 @@ #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/mutex.h> +#include <linux/cpu.h> /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is @@ -62,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; @@ -99,24 +100,25 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) * @css: css being created * * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. + * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); + cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); + static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); + cpus_read_unlock(); return 0; } @@ -124,21 +126,23 @@ static int freezer_css_online(struct cgroup_subsys_state *css) * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * - * @css is going away. Mark it dead and decrement system_freezing_count if + * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. 
*/ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); + cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); + static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) @@ -178,12 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset) if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { - freeze_task(task); /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } + freeze_task(task); } } @@ -271,16 +275,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } + if (freezing(task) && !frozen(task)) + goto out_iter_end; } freezer->state |= CGROUP_FROZEN; @@ -357,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); + static_branch_inc_cpuslocked(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { @@ -366,9 +362,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); freezer->state &= ~CGROUP_FROZEN; + if (was_freezing) + static_branch_dec_cpuslocked(&freezer_active); unfreeze_cgroup(freezer); } } @@ -386,6 +382,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; + cpus_read_lock(); /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as @@ -414,6 +411,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) } rcu_read_unlock(); mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static ssize_t freezer_write(struct kernfs_open_file *of, @@ -425,9 +423,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of, if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { + pr_info_once("Freezing with imperfect legacy cgroup freezer. 
" + "See cgroup.freeze of cgroup v2\n"); freeze = true; - else + } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index fe3e8a0eb7ed..6a01d91ea4cb 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -14,7 +14,7 @@ #include <linux/misc_cgroup.h> #define MAX_STR "max" -#define MAX_NUM ULONG_MAX +#define MAX_NUM U64_MAX /* Miscellaneous res name, keep it in sync with enum misc_res_type */ static const char *const misc_res_name[] = { @@ -24,6 +24,10 @@ static const char *const misc_res_name[] = { /* AMD SEV-ES ASIDs resource */ "sev_es", #endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + "tdx", +#endif }; /* Root misc cgroup */ @@ -37,7 +41,7 @@ static struct misc_cg root_cg; * more than the actual capacity. We are using Limits resource distribution * model of cgroup for miscellaneous controller. */ -static unsigned long misc_res_capacity[MISC_CG_RES_TYPES]; +static u64 misc_res_capacity[MISC_CG_RES_TYPES]; /** * parent_misc() - Get the parent of the passed misc cgroup. @@ -68,22 +72,6 @@ static inline bool valid_type(enum misc_res_type type) } /** - * misc_cg_res_total_usage() - Get the current total usage of the resource. - * @type: misc res type. - * - * Context: Any context. - * Return: Current total usage of the resource. - */ -unsigned long misc_cg_res_total_usage(enum misc_res_type type) -{ - if (valid_type(type)) - return atomic_long_read(&root_cg.res[type].usage); - - return 0; -} -EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); - -/** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. * @capacity: Supported capacity of the misc res on the host. @@ -95,7 +83,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); * * %0 - Successfully registered the capacity. * * %-EINVAL - If @type is invalid. */ -int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity) +int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { if (!valid_type(type)) return -EINVAL; @@ -114,13 +102,37 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity); * Context: Any context. */ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { - WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage), + WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage), "misc cgroup resource %s became less than 0", misc_res_name[type]); } +static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage) +{ + u64 old; + + while (true) { + old = atomic64_read(&res->watermark); + if (new_usage <= old) + break; + if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old) + break; + } +} + +static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg) +{ + atomic64_inc(&cg->res[type].events_local); + cgroup_file_notify(&cg->events_local_file); + + for (; parent_misc(cg); cg = parent_misc(cg)) { + atomic64_inc(&cg->res[type].events); + cgroup_file_notify(&cg->events_file); + } +} + /** * misc_cg_try_charge() - Try charging the misc cgroup. * @type: Misc res type to charge. @@ -137,13 +149,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, * * -EBUSY - If max limit will be crossed or total usage will be more than the * capacity. 
*/ -int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i, *j; int ret; struct misc_res *res; - int new_usage; + u64 new_usage; if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type]))) return -EINVAL; @@ -154,20 +165,18 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, for (i = cg; i; i = parent_misc(i)) { res = &i->res[type]; - new_usage = atomic_long_add_return(amount, &res->usage); + new_usage = atomic64_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { ret = -EBUSY; goto err_charge; } + misc_cg_update_watermark(res, new_usage); } return 0; err_charge: - for (j = i; j; j = parent_misc(j)) { - atomic_long_inc(&j->res[type].events); - cgroup_file_notify(&j->events_file); - } + misc_cg_event(type, i); for (j = cg; j != i; j = parent_misc(j)) misc_cg_cancel_charge(type, j, amount); @@ -184,8 +193,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge); * * Context: Any context. */ -void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i; @@ -209,7 +217,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) { int i; struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long max; + u64 max; for (i = 0; i < MISC_CG_RES_TYPES; i++) { if (READ_ONCE(misc_res_capacity[i])) { @@ -217,7 +225,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) if (max == MAX_NUM) seq_printf(sf, "%s max\n", misc_res_name[i]); else - seq_printf(sf, "%s %lu\n", misc_res_name[i], + seq_printf(sf, "%s %llu\n", misc_res_name[i], max); } } @@ -241,13 +249,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) * Return: * * >= 0 - Number of bytes processed in the input. * * -EINVAL - If buf is not valid. - * * -ERANGE - If number is bigger than the unsigned long capacity. + * * -ERANGE - If number is bigger than the u64 capacity. */ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct misc_cg *cg; - unsigned long max; + u64 max; int ret = 0, i; enum misc_res_type type = MISC_CG_RES_TYPES; char *token; @@ -271,7 +279,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, if (!strcmp(MAX_STR, buf)) { max = MAX_NUM; } else { - ret = kstrtoul(buf, 0, &max); + ret = kstrtou64(buf, 0, &max); if (ret) return ret; } @@ -297,13 +305,36 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, static int misc_cg_current_show(struct seq_file *sf, void *v) { int i; - unsigned long usage; + u64 usage; struct misc_cg *cg = css_misc(seq_css(sf)); for (i = 0; i < MISC_CG_RES_TYPES; i++) { - usage = atomic_long_read(&cg->res[i].usage); + usage = atomic64_read(&cg->res[i].usage); if (READ_ONCE(misc_res_capacity[i]) || usage) - seq_printf(sf, "%s %lu\n", misc_res_name[i], usage); + seq_printf(sf, "%s %llu\n", misc_res_name[i], usage); + } + + return 0; +} + +/** + * misc_cg_peak_show() - Show the peak usage of the misc cgroup. + * @sf: Interface file + * @v: Arguments passed + * + * Context: Any context. + * Return: 0 to denote successful print. 
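The switch to u64 counters leaves the consumer contract unchanged: a user such as the SEV ASID allocator still pairs misc_cg_try_charge() with misc_cg_uncharge() on the css it charged. A hedged sketch of that pattern; resource type and error handling are simplified:

#include <linux/misc_cgroup.h>

/* Charge one SEV ASID against the current task's misc cgroup. */
static struct misc_cg *charge_one_asid(int *err)
{
	struct misc_cg *cg = get_current_misc_cg();

	*err = misc_cg_try_charge(MISC_CG_RES_SEV, cg, 1);
	if (*err) {
		/* Failure bumped misc.events up the tree and events.local
		 * on the cgroup whose limit was hit. */
		put_misc_cg(cg);
		return NULL;
	}
	return cg;	/* later: misc_cg_uncharge(MISC_CG_RES_SEV, cg, 1);
			 * then put_misc_cg(cg); */
}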
+/**
+ * misc_cg_peak_show() - Show the peak usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_peak_show(struct seq_file *sf, void *v)
+{
+	int i;
+	u64 watermark;
+	struct misc_cg *cg = css_misc(seq_css(sf));
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		watermark = atomic64_read(&cg->res[i].watermark);
+		if (READ_ONCE(misc_res_capacity[i]) || watermark)
+			seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark);
 	}
 
 	return 0;
 }
@@ -322,30 +353,44 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
 static int misc_cg_capacity_show(struct seq_file *sf, void *v)
 {
 	int i;
-	unsigned long cap;
+	u64 cap;
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
 		cap = READ_ONCE(misc_res_capacity[i]);
 		if (cap)
-			seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+			seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
 	}
 
 	return 0;
 }
 
-static int misc_events_show(struct seq_file *sf, void *v)
+static int __misc_events_show(struct seq_file *sf, bool local)
 {
 	struct misc_cg *cg = css_misc(seq_css(sf));
-	unsigned long events, i;
+	u64 events;
+	int i;
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
-		events = atomic_long_read(&cg->res[i].events);
+		if (local)
+			events = atomic64_read(&cg->res[i].events_local);
+		else
+			events = atomic64_read(&cg->res[i].events);
 		if (READ_ONCE(misc_res_capacity[i]) || events)
-			seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+			seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
 	}
 	return 0;
 }
 
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+	return __misc_events_show(sf, false);
+}
+
+static int misc_events_local_show(struct seq_file *sf, void *v)
+{
+	return __misc_events_show(sf, true);
+}
+
 /* Misc cgroup interface files */
 static struct cftype misc_cg_files[] = {
 	{
@@ -357,7 +402,10 @@ static struct cftype misc_cg_files[] = {
 	{
 		.name = "current",
 		.seq_show = misc_cg_current_show,
-		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "peak",
+		.seq_show = misc_cg_peak_show,
 	},
 	{
 		.name = "capacity",
@@ -370,6 +418,12 @@ static struct cftype misc_cg_files[] = {
 		.file_offset = offsetof(struct misc_cg, events_file),
 		.seq_show = misc_events_show,
 	},
+	{
+		.name = "events.local",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.file_offset = offsetof(struct misc_cg, events_local_file),
+		.seq_show = misc_events_local_show,
+	},
 	{}
 };
 
@@ -398,7 +452,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
 		WRITE_ONCE(cg->res[i].max, MAX_NUM);
-		atomic_long_set(&cg->res[i].usage, 0);
+		atomic64_set(&cg->res[i].usage, 0);
 	}
 
 	return &cg->css;
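[Editor's note: misc_cg_max_write() above accepts either the keyword "max" or a number via kstrtou64(). A sketch of the same "max"-or-number parsing in plain C, with strtoull standing in for kstrtou64 (illustrative only):

#include <errno.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>

static int parse_max(const char *buf, uint64_t *out)
{
	char *end;

	if (!strcmp(buf, "max")) {	/* keyword selects the largest value */
		*out = UINT64_MAX;
		return 0;
	}
	errno = 0;
	*out = strtoull(buf, &end, 0);	/* base 0: accepts 0x..., 0..., decimal */
	if (errno || end == buf || *end != '\0')
		return -EINVAL;
	return 0;
}
]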
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 0d5c29879a50..db9617556dd7 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
 #include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
 
 /* cgroup namespaces */
 
@@ -21,33 +21,31 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
 
 static struct cgroup_namespace *alloc_cgroup_ns(void)
 {
-	struct cgroup_namespace *new_ns;
+	struct cgroup_namespace *new_ns __free(kfree) = NULL;
 	int ret;
 
 	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
 	if (!new_ns)
 		return ERR_PTR(-ENOMEM);
-	ret = ns_alloc_inum(&new_ns->ns);
-	if (ret) {
-		kfree(new_ns);
+	ret = ns_common_init(new_ns);
+	if (ret)
 		return ERR_PTR(ret);
-	}
-	refcount_set(&new_ns->ns.count, 1);
-	new_ns->ns.ops = &cgroupns_operations;
-	return new_ns;
+	return no_free_ptr(new_ns);
 }
 
 void free_cgroup_ns(struct cgroup_namespace *ns)
 {
+	ns_tree_remove(ns);
 	put_css_set(ns->root_cset);
 	dec_cgroup_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
-	ns_free_inum(&ns->ns);
-	kfree(ns);
+	ns_common_free(ns);
+	/* Concurrent nstree traversal depends on a grace period. */
+	kfree_rcu(ns, ns.ns_rcu);
 }
 EXPORT_SYMBOL(free_cgroup_ns);
 
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
 					struct user_namespace *user_ns,
 					struct cgroup_namespace *old_ns)
 {
@@ -87,14 +85,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 
 	new_ns->ucounts = ucounts;
 	new_ns->root_cset = cset;
+	ns_tree_add(new_ns);
 
 	return new_ns;
 }
 
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
-	return container_of(ns, struct cgroup_namespace, ns);
-}
-
 static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
 {
 	struct nsproxy *nsproxy = nsset->nsproxy;
@@ -143,15 +137,8 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns)
 
 const struct proc_ns_operations cgroupns_operations = {
 	.name		= "cgroup",
-	.type		= CLONE_NEWCGROUP,
 	.get		= cgroupns_get,
 	.put		= cgroupns_put,
 	.install	= cgroupns_install,
 	.owner		= cgroupns_owner,
 };
-
-static __init int cgroup_namespaces_init(void)
-{
-	return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
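[Editor's note: alloc_cgroup_ns() above switches to the kernel's scope-based cleanup idiom: __free(kfree) arms an automatic free on every early return, and no_free_ptr() disarms it on the success path. A user-space approximation built on the GCC/Clang cleanup attribute, with hypothetical helper names (illustrative only):

#include <stdlib.h>

static void free_ptr(void *p)
{
	free(*(void **)p);	/* runs when the variable goes out of scope */
}

#define __free_on_exit __attribute__((cleanup(free_ptr)))

static void *steal(void **p)	/* analogous to no_free_ptr() */
{
	void *ret = *p;
	*p = NULL;		/* disarm the cleanup */
	return ret;
}

struct thing { int x; };

struct thing *alloc_thing(int x)
{
	struct thing *t __free_on_exit = malloc(sizeof(*t));

	if (!t)
		return NULL;
	t->x = x;
	/* Any early return above would have freed t automatically. */
	return steal((void **)&t);
}
]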
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 511af87f685e..8f61114c36dd 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -38,6 +38,14 @@
 #define PIDS_MAX	(PID_MAX_LIMIT + 1ULL)
 #define PIDS_MAX_STR	"max"
 
+enum pidcg_event {
+	/* Fork failed in subtree because this pids_cgroup limit was hit. */
+	PIDCG_MAX,
+	/* Fork failed in this pids_cgroup because ancestor limit was hit. */
+	PIDCG_FORKFAIL,
+	NR_PIDCG_EVENTS,
+};
+
 struct pids_cgroup {
 	struct cgroup_subsys_state	css;
 
@@ -47,12 +55,14 @@ struct pids_cgroup {
 	 */
 	atomic64_t			counter;
 	atomic64_t			limit;
+	int64_t				watermark;
 
-	/* Handle for "pids.events" */
+	/* Handles for pids.events[.local] */
 	struct cgroup_file		events_file;
+	struct cgroup_file		events_local_file;
 
-	/* Number of times fork failed because limit was hit. */
-	atomic64_t			events_limit;
+	atomic64_t			events[NR_PIDCG_EVENTS];
+	atomic64_t			events_local[NR_PIDCG_EVENTS];
 };
 
 static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -74,9 +84,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
 	if (!pids)
 		return ERR_PTR(-ENOMEM);
 
-	atomic64_set(&pids->counter, 0);
 	atomic64_set(&pids->limit, PIDS_MAX);
-	atomic64_set(&pids->events_limit, 0);
 	return &pids->css;
 }
 
@@ -85,6 +93,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
 	kfree(css_pids(css));
 }
 
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+	/*
+	 * This is racy, but we don't need perfectly accurate tallying of
+	 * the watermark, and this lets us avoid extra atomic overhead.
+	 */
+	if (nr_pids > READ_ONCE(p->watermark))
+		WRITE_ONCE(p->watermark, nr_pids);
+}
+
 /**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
@@ -128,20 +146,24 @@ static void pids_charge(struct pids_cgroup *pids, int num)
 {
 	struct pids_cgroup *p;
 
-	for (p = pids; parent_pids(p); p = parent_pids(p))
-		atomic64_add(num, &p->counter);
+	for (p = pids; parent_pids(p); p = parent_pids(p)) {
+		int64_t new = atomic64_add_return(num, &p->counter);
+
+		pids_update_watermark(p, new);
+	}
 }
 
 /**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
+ * @fail: storage for the pid cgroup that caused the failure
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
-static int pids_try_charge(struct pids_cgroup *pids, int num)
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
 {
 	struct pids_cgroup *p, *q;
 
@@ -154,8 +176,15 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
 		 * p->limit is %PIDS_MAX then we know that this test will never
 		 * fail.
 		 */
-		if (new > limit)
+		if (new > limit) {
+			*fail = p;
 			goto revert;
+		}
+		/*
+		 * Not technically accurate if we go over limit somewhere up
+		 * the hierarchy, but that's tolerable for the watermark.
+		 */
+		pids_update_watermark(p, new);
 	}
 
 	return 0;
@@ -211,44 +240,54 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
 	}
 }
 
+static void pids_event(struct pids_cgroup *pids_forking,
+		       struct pids_cgroup *pids_over_limit)
+{
+	struct pids_cgroup *p = pids_forking;
+
+	/* Only log the first time limit is hit. */
+	if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
+		pr_info("cgroup: fork rejected by pids controller in ");
+		pr_cont_cgroup_path(p->css.cgroup);
+		pr_cont("\n");
+	}
+	if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+		cgroup_file_notify(&p->events_local_file);
+		return;
+	}
+
+	atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
+	cgroup_file_notify(&pids_over_limit->events_local_file);
+
+	for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
+		atomic64_inc(&p->events[PIDCG_MAX]);
+		cgroup_file_notify(&p->events_file);
+	}
+}
+
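[Editor's note: pids_event() above maintains two event streams: a "local" count on the cgroup where the fork actually failed, and a hierarchical count propagated from the level whose limit was exceeded up to the root. A single-threaded sketch of that split, with illustrative field names:

#include <stdint.h>

struct pcg {
	struct pcg *parent;
	uint64_t forkfail_local;	/* fork failed in this group itself */
	uint64_t max_local;		/* this group's own limit was hit */
	uint64_t max_hier;		/* limit hits anywhere in the subtree */
};

static void record_limit_event(struct pcg *forking, struct pcg *over_limit)
{
	forking->forkfail_local++;	/* where the fork was attempted */
	over_limit->max_local++;	/* whose limit actually stopped it */
	for (struct pcg *p = over_limit; p; p = p->parent)
		p->max_hier++;		/* visible to every ancestor */
}
]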
 /*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by the copy_process().
 */
 static int pids_can_fork(struct task_struct *task, struct css_set *cset)
 {
-	struct cgroup_subsys_state *css;
-	struct pids_cgroup *pids;
+	struct pids_cgroup *pids, *pids_over_limit;
 	int err;
 
-	if (cset)
-		css = cset->subsys[pids_cgrp_id];
-	else
-		css = task_css_check(current, pids_cgrp_id, true);
-	pids = css_pids(css);
-	err = pids_try_charge(pids, 1);
-	if (err) {
-		/* Only log the first time events_limit is incremented. */
-		if (atomic64_inc_return(&pids->events_limit) == 1) {
-			pr_info("cgroup: fork rejected by pids controller in ");
-			pr_cont_cgroup_path(css->cgroup);
-			pr_cont("\n");
-		}
-		cgroup_file_notify(&pids->events_file);
-	}
+	pids = css_pids(cset->subsys[pids_cgrp_id]);
+	err = pids_try_charge(pids, 1, &pids_over_limit);
+	if (err)
+		pids_event(pids, pids_over_limit);
+
 	return err;
 }
 
 static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
 {
-	struct cgroup_subsys_state *css;
 	struct pids_cgroup *pids;
 
-	if (cset)
-		css = cset->subsys[pids_cgrp_id];
-	else
-		css = task_css_check(current, pids_cgrp_id, true);
-	pids = css_pids(css);
+	pids = css_pids(cset->subsys[pids_cgrp_id]);
 	pids_uncharge(pids, 1);
 }
 
@@ -311,11 +350,40 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
 	return atomic64_read(&pids->counter);
 }
 
-static int pids_events_show(struct seq_file *sf, void *v)
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+			  struct cftype *cft)
+{
+	struct pids_cgroup *pids = css_pids(css);
+
+	return READ_ONCE(pids->watermark);
+}
+
+static int __pids_events_show(struct seq_file *sf, bool local)
 {
 	struct pids_cgroup *pids = css_pids(seq_css(sf));
+	enum pidcg_event pe = PIDCG_MAX;
+	atomic64_t *events;
+
+	if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+		pe = PIDCG_FORKFAIL;
+		local = true;
+	}
+	events = local ? pids->events_local : pids->events;
 
-	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
+	return 0;
+}
+
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+	__pids_events_show(sf, false);
 	return 0;
 }
 
+static int pids_events_local_show(struct seq_file *sf, void *v)
+{
+	__pids_events_show(sf, true);
+	return 0;
+}
+
@@ -332,14 +400,52 @@ static struct cftype pids_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
 	{
+		.name = "peak",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = pids_peak_read,
+	},
+	{
 		.name = "events",
 		.seq_show = pids_events_show,
 		.file_offset = offsetof(struct pids_cgroup, events_file),
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
+	{
+		.name = "events.local",
+		.seq_show = pids_events_local_show,
+		.file_offset = offsetof(struct pids_cgroup, events_local_file),
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{ }	/* terminate */
 };
 
+static struct cftype pids_files_legacy[] = {
+	{
+		.name = "max",
+		.write = pids_max_write,
+		.seq_show = pids_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.read_s64 = pids_current_read,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "peak",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = pids_peak_read,
+	},
+	{
+		.name = "events",
+		.seq_show = pids_events_show,
+		.file_offset = offsetof(struct pids_cgroup, events_file),
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }	/* terminate */
+};
+
+
 struct cgroup_subsys pids_cgrp_subsys = {
 	.css_alloc	= pids_css_alloc,
 	.css_free	= pids_css_free,
@@ -348,7 +454,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
 	.can_fork	= pids_can_fork,
 	.cancel_fork	= pids_cancel_fork,
 	.release	= pids_release,
-	.legacy_cftypes	= pids_files,
+	.legacy_cftypes	= pids_files_legacy,
 	.dfl_cftypes	= pids_files,
 	.threaded	= true,
 };
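[Editor's note: the new pids.peak and pids.events.local files follow the usual one-value-per-file cgroup v2 convention, so they can be consumed with plain reads. An illustrative user-space check; the cgroup path is hypothetical and the files exist only on kernels carrying these changes:

#include <stdio.h>

static void dump(const char *name)
{
	char path[256], line[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/cgroup/test/%s", name);
	f = fopen(path, "r");
	if (!f) {
		perror(path);	/* absent file: controller off or old kernel */
		return;
	}
	while (fgets(line, sizeof(line), f))
		printf("%s: %s", name, line);
	fclose(f);
}

int main(void)
{
	dump("pids.current");
	dump("pids.peak");		/* max concurrent pids observed */
	dump("pids.events");		/* hierarchical limit hits */
	dump("pids.events.local");	/* failures in this cgroup itself */
	return 0;
}
]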
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 3135406608c7..ef5878fb2005 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -197,6 +197,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
 
 /**
 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @stop_cg: while traversing the hierarchy, stop uncharging when the
 *           stop_cg cgroup is met
@@ -221,6 +222,7 @@ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
 
 /**
 * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cgroup in given resource pool
 */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..a198e40c799b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,48 +3,132 @@
 
 #include <linux/sched/cputime.h>
 
-static DEFINE_SPINLOCK(cgroup_rstat_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
+#include <trace/events/cgroup.h>
+
+static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
-static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
+/*
+ * Determines whether a given css can participate in rstat.
+ * css's that are cgroup::self use rstat for base stats.
+ * Other css's associated with a subsystem use rstat only when
+ * they define the ss->css_rstat_flush callback.
+ */
+static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
+{
+	return css_is_self(css) || css->ss->css_rstat_flush != NULL;
+}
+
+static struct css_rstat_cpu *css_rstat_cpu(
+		struct cgroup_subsys_state *css, int cpu)
+{
+	return per_cpu_ptr(css->rstat_cpu, cpu);
+}
+
+static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
+		struct cgroup *cgrp, int cpu)
 {
-	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
+	return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
+}
+
+static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+{
+	if (ss)
+		return &ss->rstat_ss_lock;
+
+	return &rstat_base_lock;
+}
+
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+	if (ss)
+		return per_cpu_ptr(ss->lhead, cpu);
+	return per_cpu_ptr(&rstat_backlog_list, cpu);
 }
 
 /**
- * cgroup_rstat_updated - keep track of updated rstat_cpu
- * @cgrp: target cgroup
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
 * @cpu: cpu on which rstat_cpu was updated
 *
- * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * cgroup_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either adds itself
+ * to the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
 */
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 {
-	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
-	unsigned long flags;
+	struct llist_head *lhead;
+	struct css_rstat_cpu *rstatc;
+	struct css_rstat_cpu __percpu *rstatc_pcpu;
+	struct llist_node *self;
 
 	/*
-	 * Speculative already-on-list test. This may race leading to
-	 * temporary inaccuracies, which is fine.
+	 * Since bpf programs can call this function, prevent access to
+	 * uninitialized rstat pointers.
+	 */
+	if (!css_uses_rstat(css))
+		return;
+
+	lockdep_assert_preemption_disabled();
+
+	/*
+	 * For archs without nmi safe cmpxchg or percpu ops support, ignore
+	 * the requests from nmi context.
+	 */
+	if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+	     !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+		return;
+
+	rstatc = css_rstat_cpu(css, cpu);
+	/*
+	 * If already on list return. This check is racy and smp_mb() is needed
+	 * to pair it with the smp_mb() in css_process_update_tree() if the
+	 * guarantee that the updated stats are visible to concurrent flusher is
+	 * needed.
+	 */
+	if (llist_on_list(&rstatc->lnode))
+		return;
+
+	/*
+	 * This function can be re-entered by irqs and nmis for the same cgroup
+	 * and may try to insert the same per-cpu lnode into the llist. Note
+	 * that llist_add() does not protect against such scenarios.
 	 *
-	 * Because @parent's updated_children is terminated with @parent
-	 * instead of NULL, we can tell whether @cgrp is on the list by
-	 * testing the next pointer for NULL.
+	 * To protect against such stacked contexts of irqs/nmis, we use the
+	 * fact that lnode points to itself when not on a list and then use
+	 * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+	 * which will call llist_add(). The losers can assume the insertion is
+	 * successful and the winner will eventually add the per-cpu lnode to
+	 * the llist.
 	 */
-	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
+	self = &rstatc->lnode;
+	rstatc_pcpu = css->rstat_cpu;
+	if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
 		return;
 
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	lhead = ss_lhead_cpu(css->ss, cpu);
+	llist_add(&rstatc->lnode, lhead);
+}
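[Editor's note: the "lnode points to itself when off-list" trick above lets a CAS from self to NULL pick exactly one winner among stacked irq/nmi contexts; only the winner performs the real insertion. A user-space sketch of the same idea with C11 atomics (names illustrative):

#include <stdatomic.h>
#include <stddef.h>

struct lnode { _Atomic(struct lnode *) next; };

static void lnode_init(struct lnode *n)
{
	atomic_store(&n->next, n);	/* self-pointer == "not on a list" */
}

/* Returns 1 if the caller won and must insert @n, 0 if someone else will. */
static int lnode_claim(struct lnode *n)
{
	struct lnode *expected = n;

	return atomic_compare_exchange_strong(&n->next, &expected, NULL);
}

A loser's CAS fails because the pointer is already NULL (claimed) or a real list link, so every racing context can safely assume the node will end up on the list exactly once.]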
 
-	/* put @cgrp and all ancestors on the corresponding updated lists */
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
+	/* put @css and all ancestors on the corresponding updated lists */
 	while (true) {
-		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
-		struct cgroup *parent = cgroup_parent(cgrp);
-		struct cgroup_rstat_cpu *prstatc;
+		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+		struct cgroup_subsys_state *parent = css->parent;
+		struct css_rstat_cpu *prstatc;
 
 		/*
 		 * Both additions and removals are bottom-up.  If a cgroup
@@ -55,82 +139,169 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 
 		/* Root has no parent to link it to, but mark it busy */
 		if (!parent) {
-			rstatc->updated_next = cgrp;
+			rstatc->updated_next = css;
 			break;
 		}
 
-		prstatc = cgroup_rstat_cpu(parent, cpu);
+		prstatc = css_rstat_cpu(parent, cpu);
 		rstatc->updated_next = prstatc->updated_children;
-		prstatc->updated_children = cgrp;
+		prstatc->updated_children = css;
 
-		cgrp = parent;
+		css = parent;
 	}
+}
+
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+	struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+	struct llist_node *lnode;
 
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	while ((lnode = llist_del_first_init(lhead))) {
+		struct css_rstat_cpu *rstatc;
+
+		/*
+		 * smp_mb() is needed here (more specifically in between
+		 * init_llist_node() and per-cpu stats flushing) if the
+		 * guarantee is required by a rstat user where either the
+		 * updater adds itself to the lockless list or the flusher
+		 * flushes the stats updated by an updater who has observed
+		 * that it is already on the list. The corresponding barrier
+		 * pair for this one should be before css_rstat_updated()
+		 * by the user.
+		 *
+		 * For now, there aren't any such users, so the barrier is
+		 * not added here, but if such a use case arises, please add
+		 * smp_mb() here.
+		 */
+
+		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+		__css_process_update_tree(rstatc->owner, cpu);
+	}
 }
 
 /**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * css_rstat_push_children - push children css's into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
 * @cpu: target cpu
+ * Return: A new singly linked list of css's to be flushed
 *
- * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_rstat_cpu_lock held.
- *
- * The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * Iteratively traverse down the css_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list via the rstat_flush_next pointer built from the
+ * tail backward like "pushing" css's into a stack. The root is pushed by
+ * the caller.
 */
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
-						   struct cgroup *root, int cpu)
+static struct cgroup_subsys_state *css_rstat_push_children(
+		struct cgroup_subsys_state *head,
+		struct cgroup_subsys_state *child, int cpu)
 {
-	struct cgroup_rstat_cpu *rstatc;
-	struct cgroup *parent;
+	struct cgroup_subsys_state *cnext = child;	/* Next head of child css level */
+	struct cgroup_subsys_state *ghead = NULL;	/* Head of grandchild css level */
+	struct cgroup_subsys_state *parent, *grandchild;
+	struct css_rstat_cpu *crstatc;
 
-	if (pos == root)
-		return NULL;
+	child->rstat_flush_next = NULL;
 
 	/*
-	 * We're gonna walk down to the first leaf and visit/remove it.  We
-	 * can pick whatever unvisited node as the starting point.
+	 * The subsystem rstat lock must be held for the whole duration from
+	 * here as the rstat_flush_next list is being constructed to when
+	 * it is consumed later in css_rstat_flush().
 	 */
-	if (!pos) {
-		pos = root;
-		/* return NULL if this subtree is not on-list */
-		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
-			return NULL;
-	} else {
-		pos = cgroup_parent(pos);
+	lockdep_assert_held(ss_rstat_lock(head->ss));
+
+	/*
+	 * Notation: -> updated_next pointer
+	 *	     => rstat_flush_next pointer
+	 *
+	 * Assuming the following sample updated_children lists:
+	 *  P: C1 -> C2 -> P
+	 *  C1: G11 -> G12 -> C1
+	 *  C2: G21 -> G22 -> C2
+	 *
+	 * After 1st iteration:
+	 *  head => C2 => C1 => NULL
+	 *  ghead => G21 => G11 => NULL
+	 *
+	 * After 2nd iteration:
+	 *  head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
+	 */
+next_level:
+	while (cnext) {
+		child = cnext;
+		cnext = child->rstat_flush_next;
+		parent = child->parent;
+
+		/* updated_next is parent cgroup terminated if !NULL */
+		while (child != parent) {
+			child->rstat_flush_next = head;
+			head = child;
+			crstatc = css_rstat_cpu(child, cpu);
+			grandchild = crstatc->updated_children;
+			if (grandchild != child) {
+				/* Push the grand child to the next level */
+				crstatc->updated_children = child;
+				grandchild->rstat_flush_next = ghead;
+				ghead = grandchild;
+			}
+			child = crstatc->updated_next;
+			crstatc->updated_next = NULL;
+		}
 	}
 
-	/* walk down to the first leaf */
-	while (true) {
-		rstatc = cgroup_rstat_cpu(pos, cpu);
-		if (rstatc->updated_children == pos)
-			break;
-		pos = rstatc->updated_children;
+	if (ghead) {
+		cnext = ghead;
+		ghead = NULL;
+		goto next_level;
 	}
+	return head;
+}
+
+/**
+ * css_rstat_updated_list - build a list of updated css's to be flushed
+ * @root: root of the css subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of css's to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned css is unlinked from the updated tree.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child css's if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent css. An exception
+ * here is the css root whose updated_next can be self terminated.
+ */
+static struct cgroup_subsys_state *css_rstat_updated_list(
+		struct cgroup_subsys_state *root, int cpu)
+{
+	struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
+	struct cgroup_subsys_state *head = NULL, *parent, *child;
+
+	css_process_update_tree(root->ss, cpu);
+
+	/* Return NULL if this subtree is not on-list */
+	if (!rstatc->updated_next)
+		return NULL;
 
 	/*
-	 * Unlink @pos from the tree.  As the updated_children list is
+	 * Unlink @root from its parent. As the updated_children list is
 	 * singly linked, we have to walk it to find the removal point.
-	 * However, due to the way we traverse, @pos will be the first
-	 * child in most cases. The only exception is @root.
 	 */
-	parent = cgroup_parent(pos);
+	parent = root->parent;
 	if (parent) {
-		struct cgroup_rstat_cpu *prstatc;
-		struct cgroup **nextp;
+		struct css_rstat_cpu *prstatc;
+		struct cgroup_subsys_state **nextp;
 
-		prstatc = cgroup_rstat_cpu(parent, cpu);
+		prstatc = css_rstat_cpu(parent, cpu);
 		nextp = &prstatc->updated_children;
-		while (*nextp != pos) {
-			struct cgroup_rstat_cpu *nrstatc;
+		while (*nextp != root) {
+			struct css_rstat_cpu *nrstatc;
 
-			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+			nrstatc = css_rstat_cpu(*nextp, cpu);
 			WARN_ON_ONCE(*nextp == parent);
 			nextp = &nrstatc->updated_next;
 		}
@@ -138,157 +309,224 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
 	}
 
 	rstatc->updated_next = NULL;
-	return pos;
-}
 
-/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
-	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
-{
-	int cpu;
-
-	lockdep_assert_held(&cgroup_rstat_lock);
+	/* Push @root to the list first before pushing the children */
+	head = root;
+	root->rstat_flush_next = NULL;
+	child = rstatc->updated_children;
+	rstatc->updated_children = root;
+	if (child != root)
+		head = css_rstat_push_children(head, child, cpu);
 
-	for_each_possible_cpu(cpu) {
-		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
-						       cpu);
-		struct cgroup *pos = NULL;
+	return head;
+}
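[Editor's note: the updated tree uses two sentinel conventions worth seeing in isolation: an empty updated_children list points at the node itself, and updated_next chains siblings and is terminated by the parent, so a non-NULL updated_next doubles as the "already on tree" test. A single-threaded toy model of the insertion side (illustrative names, no locking):

#include <stddef.h>

struct node {
	struct node *parent;
	struct node *updated_next;	/* sibling link, parent-terminated */
	struct node *updated_children;	/* child list, self-terminated */
};

static void node_init(struct node *n, struct node *parent)
{
	n->parent = parent;
	n->updated_next = NULL;		/* NULL == not on the tree */
	n->updated_children = n;	/* empty child list points to self */
}

static void mark_updated(struct node *n)
{
	for (; n; n = n->parent) {
		if (n->updated_next)	/* this level already linked */
			break;
		if (!n->parent) {	/* root: mark busy, terminate walk */
			n->updated_next = n;
			break;
		}
		n->updated_next = n->parent->updated_children;
		n->parent->updated_children = n;
	}
}
]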
 
-		raw_spin_lock(cpu_lock);
-		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
-			struct cgroup_subsys_state *css;
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for css_rstat_updated() and
+ * css_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ */
 
-			cgroup_base_stat_flush(pos, cpu);
+__bpf_hook_start();
 
-			rcu_read_lock();
-			list_for_each_entry_rcu(css, &pos->rstat_css_list,
-						rstat_css_node)
-				css->ss->css_rstat_flush(css, cpu);
-			rcu_read_unlock();
-		}
-		raw_spin_unlock(cpu_lock);
-
-		/* if @may_sleep, play nice and yield if necessary */
-		if (may_sleep && (need_resched() ||
-				  spin_needbreak(&cgroup_rstat_lock))) {
-			spin_unlock_irq(&cgroup_rstat_lock);
-			if (!cond_resched())
-				cpu_relax();
-			spin_lock_irq(&cgroup_rstat_lock);
-		}
-	}
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+				     struct cgroup *parent, int cpu)
+{
 }
 
-/**
- * cgroup_rstat_flush - flush stats in @cgrp's subtree
- * @cgrp: target cgroup
- *
- * Collect all per-cpu stats in @cgrp's subtree into the global counters
- * and propagate them upwards.  After this function returns, all cgroups in
- * the subtree have up-to-date ->stat.
- *
- * This also gets all cgroups in the subtree including @cgrp off the
- * ->updated_children lists.
+__bpf_hook_end();
+
+/*
+ * Helper functions for locking.
 *
- * This function may block.
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicates that the
+ * lock was released and re-taken when collecting data from the CPUs. The
+ * value -1 is used when obtaining the main lock, else this is the CPU
+ * number processed last.
 */
-void cgroup_rstat_flush(struct cgroup *cgrp)
+static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
+				    int cpu_in_loop)
+	__acquires(ss_rstat_lock(css->ss))
 {
-	might_sleep();
-
-	spin_lock_irq(&cgroup_rstat_lock);
-	cgroup_rstat_flush_locked(cgrp, true);
-	spin_unlock_irq(&cgroup_rstat_lock);
+	struct cgroup *cgrp = css->cgroup;
+	spinlock_t *lock;
+	bool contended;
+
+	lock = ss_rstat_lock(css->ss);
+	contended = !spin_trylock_irq(lock);
+	if (contended) {
+		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+		spin_lock_irq(lock);
+	}
+	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
 }
 
-/**
- * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
- * @cgrp: target cgroup
- *
- * This function can be called from any context.
- */
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
+				      int cpu_in_loop)
	__releases(ss_rstat_lock(css->ss))
 {
-	unsigned long flags;
+	struct cgroup *cgrp = css->cgroup;
+	spinlock_t *lock;
 
-	spin_lock_irqsave(&cgroup_rstat_lock, flags);
-	cgroup_rstat_flush_locked(cgrp, false);
-	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
+	lock = ss_rstat_lock(css->ss);
+	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+	spin_unlock_irq(lock);
 }
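[Editor's note: __css_rstat_lock() above uses trylock-then-lock purely for observability: the failed trylock records that the lock was contended before blocking on it. A minimal pthread rendition of the same pattern, with the tracepoints replaced by a counter (illustrative only):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long contended_count;

static void stat_lock_acquire(void)
{
	bool contended = pthread_mutex_trylock(&stat_lock) != 0;

	if (contended) {
		__atomic_add_fetch(&contended_count, 1, __ATOMIC_RELAXED);
		pthread_mutex_lock(&stat_lock);	/* now block for real */
	}
}
]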
 
 /**
- * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
- * @cgrp: target cgroup
+ * css_rstat_flush - flush stats in @css's rstat subtree
+ * @css: target cgroup subsystem state
 *
- * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
- * paired with cgroup_rstat_flush_release().
+ * Collect all per-cpu stats in @css's subtree into the global counters
+ * and propagate them upwards. After this function returns, all rstat
+ * nodes in the subtree have up-to-date ->stat.
+ *
+ * This also gets all rstat nodes in the subtree including @css off the
+ * ->updated_children lists.
 *
 * This function may block.
 */
-void cgroup_rstat_flush_hold(struct cgroup *cgrp)
-	__acquires(&cgroup_rstat_lock)
+__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
 {
-	might_sleep();
-	spin_lock_irq(&cgroup_rstat_lock);
-	cgroup_rstat_flush_locked(cgrp, true);
-}
+	int cpu;
+	bool is_self = css_is_self(css);
 
-/**
- * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
- */
-void cgroup_rstat_flush_release(void)
-	__releases(&cgroup_rstat_lock)
-{
-	spin_unlock_irq(&cgroup_rstat_lock);
+	/*
+	 * Since bpf programs can call this function, prevent access to
+	 * uninitialized rstat pointers.
+	 */
+	if (!css_uses_rstat(css))
+		return;
+
+	might_sleep();
+	for_each_possible_cpu(cpu) {
+		struct cgroup_subsys_state *pos;
+
+		/* Reacquire for each CPU to avoid disabling IRQs too long */
+		__css_rstat_lock(css, cpu);
+		pos = css_rstat_updated_list(css, cpu);
+		for (; pos; pos = pos->rstat_flush_next) {
+			if (is_self) {
+				cgroup_base_stat_flush(pos->cgroup, cpu);
+				bpf_rstat_flush(pos->cgroup,
+						cgroup_parent(pos->cgroup), cpu);
+			} else
+				pos->ss->css_rstat_flush(pos, cpu);
+		}
+		__css_rstat_unlock(css, cpu);
+		if (!cond_resched())
+			cpu_relax();
+	}
 }
 
 int css_rstat_init(struct cgroup_subsys_state *css)
 {
+	struct cgroup *cgrp = css->cgroup;
 	int cpu;
+	bool is_self = css_is_self(css);
+
+	if (is_self) {
+		/* the root cgrp has rstat_base_cpu preallocated */
+		if (!cgrp->rstat_base_cpu) {
+			cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
+			if (!cgrp->rstat_base_cpu)
+				return -ENOMEM;
+		}
+	} else if (css->ss->css_rstat_flush == NULL)
+		return 0;
+
+	/* the root cgrp's self css has rstat_cpu preallocated */
+	if (!css->rstat_cpu) {
+		css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
+		if (!css->rstat_cpu) {
+			if (is_self)
+				free_percpu(cgrp->rstat_base_cpu);
 
-	/* the root cgrp has rstat_cpu preallocated */
-	if (!cgrp->rstat_cpu) {
-		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
-		if (!cgrp->rstat_cpu)
 			return -ENOMEM;
+		}
 	}
 
 	/* ->updated_children list is self terminated */
 	for_each_possible_cpu(cpu) {
-		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+
+		rstatc->owner = rstatc->updated_children = css;
+		init_llist_node(&rstatc->lnode);
+
+		if (is_self) {
+			struct cgroup_rstat_base_cpu *rstatbc;
 
-		rstatc->updated_children = cgrp;
-		u64_stats_init(&rstatc->bsync);
+			rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
+			u64_stats_init(&rstatbc->bsync);
+		}
 	}
 
 	return 0;
 }
 
 void css_rstat_exit(struct cgroup_subsys_state *css)
 {
 	int cpu;
 
-	cgroup_rstat_flush(cgrp);
+	if (!css_uses_rstat(css))
+		return;
+
+	if (!css->rstat_cpu)
+		return;
+
+	css_rstat_flush(css);
 
 	/* sanity check */
 	for_each_possible_cpu(cpu) {
-		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
 
-		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
+		if (WARN_ON_ONCE(rstatc->updated_children != css) ||
 		    WARN_ON_ONCE(rstatc->updated_next))
 			return;
 	}
 
-	free_percpu(cgrp->rstat_cpu);
-	cgrp->rstat_cpu = NULL;
+	if (css_is_self(css)) {
+		struct cgroup *cgrp = css->cgroup;
+
+		free_percpu(cgrp->rstat_base_cpu);
+		cgrp->rstat_base_cpu = NULL;
+	}
+
+	free_percpu(css->rstat_cpu);
+	css->rstat_cpu = NULL;
 }
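[Editor's note: per css_uses_rstat() above, a controller opts into rstat simply by providing ss->css_rstat_flush and marking its css updated from the hot path. A sketch of that wiring for a hypothetical subsystem; demo_pcpu, css_demo() and the field names are invented for illustration, only css_rstat_updated()/css_rstat_flush() and the ss callback are the real interface:

struct demo_pcpu {
	u64 nr_ops;		/* per-cpu, written locklessly */
	u64 last_nr_ops;	/* snapshot consumed by the flusher */
};

static void demo_account_op(struct cgroup_subsys_state *css)
{
	this_cpu_inc(css_demo(css)->pcpu->nr_ops);
	css_rstat_updated(css, smp_processor_id());	/* mark for flush */
}

/* ss->css_rstat_flush: fold one cpu's delta into the global counter */
static void demo_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct demo_pcpu *pc = per_cpu_ptr(css_demo(css)->pcpu, cpu);
	u64 delta = pc->nr_ops - pc->last_nr_ops;

	pc->last_nr_ops = pc->nr_ops;
	css_demo(css)->total_ops += delta;
}

Reads of the aggregate would then call css_rstat_flush(css) first, exactly as cgroup_base_stat_cputime_show() does for the base stats below.]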
 
-void __init cgroup_rstat_boot(void)
+/**
+ * ss_rstat_init - subsystem-specific rstat initialization
+ * @ss: target subsystem
+ *
+ * If @ss is NULL, the static locks associated with the base stats
+ * are initialized. If @ss is non-NULL, the subsystem-specific locks
+ * are initialized.
+ */
+int __init ss_rstat_init(struct cgroup_subsys *ss)
 {
 	int cpu;
 
+	if (ss) {
+		ss->lhead = alloc_percpu(struct llist_head);
+		if (!ss->lhead)
+			return -ENOMEM;
+	}
+
+	spin_lock_init(ss_rstat_lock(ss));
 	for_each_possible_cpu(cpu)
-		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+		init_llist_head(ss_lhead_cpu(ss, cpu));
+
+	return 0;
 }
 
 /*
@@ -301,6 +539,10 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->cputime.utime += src_bstat->cputime.utime;
 	dst_bstat->cputime.stime += src_bstat->cputime.stime;
 	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
+	dst_bstat->ntime += src_bstat->ntime;
 }
 
 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -309,13 +551,18 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
 	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
 	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
+	dst_bstat->ntime -= src_bstat->ntime;
 }
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 {
-	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+	struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
 	struct cgroup *parent = cgroup_parent(cgrp);
-	struct cgroup_base_stat cur, delta;
+	struct cgroup_rstat_base_cpu *prstatbc;
+	struct cgroup_base_stat delta;
 	unsigned seq;
 
 	/* Root-level stats are sourced from system-wide CPU stats */
@@ -324,77 +571,90 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 
 	/* fetch the current per-cpu values */
 	do {
-		seq = __u64_stats_fetch_begin(&rstatc->bsync);
-		cur.cputime = rstatc->bstat.cputime;
-	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+		seq = __u64_stats_fetch_begin(&rstatbc->bsync);
+		delta = rstatbc->bstat;
+	} while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));
 
-	/* propagate percpu delta to global */
-	delta = cur;
-	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
+	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
+	cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
 	cgroup_base_stat_add(&cgrp->bstat, &delta);
-	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+	cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
+	cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);
 
-	/* propagate global delta to parent (unless that's root) */
+	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
 	if (cgroup_parent(parent)) {
 		delta = cgrp->bstat;
 		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
 		cgroup_base_stat_add(&parent->bstat, &delta);
 		cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+		delta = rstatbc->subtree_bstat;
+		prstatbc = cgroup_rstat_base_cpu(parent, cpu);
+		cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
+		cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
+		cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
 	}
 }
 
-static struct cgroup_rstat_cpu *
+static struct cgroup_rstat_base_cpu *
 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
 {
-	struct cgroup_rstat_cpu *rstatc;
+	struct cgroup_rstat_base_cpu *rstatbc;
 
-	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
-	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
-	return rstatc;
+	rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu);
+	*flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
+	return rstatbc;
 }
 
 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
-						 struct cgroup_rstat_cpu *rstatc,
+						 struct cgroup_rstat_base_cpu *rstatbc,
 						 unsigned long flags)
 {
-	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
-	cgroup_rstat_updated(cgrp, smp_processor_id());
-	put_cpu_ptr(rstatc);
+	u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
+	css_rstat_updated(&cgrp->self, smp_processor_id());
+	put_cpu_ptr(rstatbc);
 }
 
 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
 {
-	struct cgroup_rstat_cpu *rstatc;
+	struct cgroup_rstat_base_cpu *rstatbc;
 	unsigned long flags;
 
-	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
-	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
-	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+	rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
+	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
 }
 
 void __cgroup_account_cputime_field(struct cgroup *cgrp,
 				    enum cpu_usage_stat index, u64 delta_exec)
 {
-	struct cgroup_rstat_cpu *rstatc;
+	struct cgroup_rstat_base_cpu *rstatbc;
 	unsigned long flags;
 
-	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 
 	switch (index) {
-	case CPUTIME_USER:
 	case CPUTIME_NICE:
-		rstatc->bstat.cputime.utime += delta_exec;
+		rstatbc->bstat.ntime += delta_exec;
+		fallthrough;
+	case CPUTIME_USER:
+		rstatbc->bstat.cputime.utime += delta_exec;
 		break;
 	case CPUTIME_SYSTEM:
 	case CPUTIME_IRQ:
 	case CPUTIME_SOFTIRQ:
-		rstatc->bstat.cputime.stime += delta_exec;
+		rstatbc->bstat.cputime.stime += delta_exec;
+		break;
+#ifdef CONFIG_SCHED_CORE
+	case CPUTIME_FORCEIDLE:
+		rstatbc->bstat.forceidle_sum += delta_exec;
 		break;
+#endif
 	default:
 		break;
 	}
 
-	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
 }
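[Editor's note: the account_begin/account_end pair above brackets per-cpu updates with a u64_stats sequence so 32-bit readers never observe a torn 64-bit value. A toy C11 seqcount in the same spirit, illustrative only (the kernel helpers additionally handle irq disabling and compile away on 64-bit):

#include <stdatomic.h>
#include <stdint.h>

struct stat_sync {
	_Atomic unsigned seq;
	uint64_t value;		/* the protected 64-bit counter */
};

static void stat_write(struct stat_sync *s, uint64_t delta)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);
	s->value += delta;	/* sequence is odd while updating */
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);
}

static uint64_t stat_read(struct stat_sync *s)
{
	unsigned begin;
	uint64_t v;

	do {			/* retry on odd or changed sequence */
		begin = atomic_load_explicit(&s->seq, memory_order_acquire);
		v = s->value;
	} while ((begin & 1) ||
		 begin != atomic_load_explicit(&s->seq, memory_order_acquire));
	return v;
}
]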
 /*
@@ -403,13 +663,12 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 {
+	struct task_cputime *cputime = &bstat->cputime;
 	int i;
 
-	cputime->stime = 0;
-	cputime->utime = 0;
-	cputime->sum_exec_runtime = 0;
+	memset(bstat, 0, sizeof(*bstat));
 	for_each_possible_cpu(i) {
 		struct kernel_cpustat kcpustat;
 		u64 *cpustat = kcpustat.cpustat;
@@ -429,35 +688,72 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
 
 		cputime->sum_exec_runtime += user;
 		cputime->sum_exec_runtime += sys;
-		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
+		bstat->ntime += cpustat[CPUTIME_NICE];
 	}
 }
 
+
+static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_CORE
+	u64 forceidle_time = bstat->forceidle_sum;
+
+	do_div(forceidle_time, NSEC_PER_USEC);
+	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
 void cgroup_base_stat_cputime_show(struct seq_file *seq)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	u64 usage, utime, stime;
-	struct task_cputime cputime;
+	struct cgroup_base_stat bstat;
 
 	if (cgroup_parent(cgrp)) {
-		cgroup_rstat_flush_hold(cgrp);
-		usage = cgrp->bstat.cputime.sum_exec_runtime;
+		css_rstat_flush(&cgrp->self);
+		__css_rstat_lock(&cgrp->self, -1);
+		bstat = cgrp->bstat;
 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
-			       &utime, &stime);
-		cgroup_rstat_flush_release();
+			       &bstat.cputime.utime, &bstat.cputime.stime);
+		__css_rstat_unlock(&cgrp->self, -1);
 	} else {
-		root_cgroup_cputime(&cputime);
-		usage = cputime.sum_exec_runtime;
-		utime = cputime.utime;
-		stime = cputime.stime;
+		root_cgroup_cputime(&bstat);
 	}
 
-	do_div(usage, NSEC_PER_USEC);
-	do_div(utime, NSEC_PER_USEC);
-	do_div(stime, NSEC_PER_USEC);
+	do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
+	do_div(bstat.cputime.utime, NSEC_PER_USEC);
+	do_div(bstat.cputime.stime, NSEC_PER_USEC);
+	do_div(bstat.ntime, NSEC_PER_USEC);
 
 	seq_printf(seq, "usage_usec %llu\n"
-		   "user_usec %llu\n"
-		   "system_usec %llu\n",
-		   usage, utime, stime);
+			"user_usec %llu\n"
+			"system_usec %llu\n"
+			"nice_usec %llu\n",
+			bstat.cputime.sum_exec_runtime,
+			bstat.cputime.utime,
+			bstat.cputime.stime,
+			bstat.ntime);
+
+	cgroup_force_idle_show(seq, &bstat);
+}
+
+/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
+BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, css_rstat_updated)
+BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
+BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+	.owner		= THIS_MODULE,
+	.set		= &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+					 &bpf_rstat_kfunc_set);
 }
+late_initcall(bpf_rstat_kfunc_init);
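[Editor's note: the kfunc registration above makes css_rstat_updated() and css_rstat_flush() callable from tracing BPF programs. A sketch of a program pairing with them; the attach point and map logic are hypothetical, only the kfunc signatures mirror the kernel side:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) __ksym;
extern void css_rstat_flush(struct cgroup_subsys_state *css) __ksym;

SEC("fentry/cgroup_attach_task")	/* hypothetical hook */
int BPF_PROG(count_attach, struct cgroup *dst_cgrp)
{
	/* ...update a per-cpu map keyed by cgroup id here... */
	css_rstat_updated(&dst_cgrp->self, bpf_get_smp_processor_id());
	return 0;
}

char _license[] SEC("license") = "GPL";

A companion program attached to the bpf_rstat_flush() hook would then fold the per-cpu map entries into totals, completing the collect/flush workflow the comment in the diff describes.]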
