diff options
Diffstat (limited to 'kernel/cgroup')
-rw-r--r-- | kernel/cgroup/Makefile | 2 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-internal.h | 2 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 18 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 196 | ||||
-rw-r--r-- | kernel/cgroup/cpuset-internal.h | 305 | ||||
-rw-r--r-- | kernel/cgroup/cpuset-v1.c | 562 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 1600 | ||||
-rw-r--r-- | kernel/cgroup/dmem.c | 829 | ||||
-rw-r--r-- | kernel/cgroup/freezer.c | 97 | ||||
-rw-r--r-- | kernel/cgroup/legacy_freezer.c | 5 | ||||
-rw-r--r-- | kernel/cgroup/misc.c | 80 | ||||
-rw-r--r-- | kernel/cgroup/pids.c | 141 | ||||
-rw-r--r-- | kernel/cgroup/rstat.c | 179 |
13 files changed, 2608 insertions, 1408 deletions
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 12f8457ad1f9..ede31601a363 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -5,5 +5,7 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o obj-$(CONFIG_CGROUP_MISC) += misc.o +obj-$(CONFIG_CGROUP_DMEM) += dmem.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 520b90dd97ec..c964dd7ff967 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -81,6 +81,8 @@ struct cgroup_file_ctx { struct { struct cgroup_pidlist *pidlist; } procs1; + + struct cgroup_of_peak peak; }; /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..e28d5f0d20ed 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid) return cgroup_no_v1_mask & (1 << ssid); } +static bool cgroup1_subsys_absent(struct cgroup_subsys *ss) +{ + /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */ + return ss->legacy_cftypes == NULL && ss->dfl_cftypes; +} + /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task @@ -675,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * cgroup_mutex contention. */ - for_each_subsys(ss, i) + for_each_subsys(ss, i) { + if (cgroup1_subsys_absent(ss)) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); + } return 0; } @@ -932,7 +941,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { - if (strcmp(param->key, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name) || + cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", @@ -1024,7 +1034,8 @@ static int check_cgroupfs_options(struct fs_context *fc) mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && + !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; @@ -1335,6 +1346,7 @@ static int __init cgroup_no_v1(char *str) continue; cgroup_no_v1_mask |= 1 << i; + break; } } return 1; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c851c..afc665b7b1fe 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1744,8 +1744,11 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); - if (ret < 0) + if (ret < 0) { + cgroup_addrm_files(css, cgrp, + cgroup_base_files, false); return ret; + } } } else { ret = cgroup_addrm_files(css, cgrp, @@ -1839,9 +1842,9 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); + css->cgroup = dcgrp; WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { @@ -1922,6 +1925,7 @@ enum cgroup2_param { Opt_memory_localevents, Opt_memory_recursiveprot, Opt_memory_hugetlb_accounting, + Opt_pids_localevents, nr__cgroup2_params }; @@ -1931,6 +1935,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), + fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; @@ -1960,10 +1965,20 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_hugetlb_accounting: ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; return 0; + case Opt_pids_localevents: + ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + return 0; } return -EINVAL; } +struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + return &ctx->peak; +} + static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { @@ -1989,6 +2004,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + + if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } @@ -2004,6 +2024,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_recursiveprot"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) seq_puts(seq, ",memory_hugetlb_accounting"); + if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + seq_puts(seq, ",pids_localevents"); return 0; } @@ -2118,8 +2140,10 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto exit_stats; - ret = cgroup_bpf_inherit(root_cgrp); - WARN_ON_ONCE(ret); + if (root == &cgrp_dfl_root) { + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + } trace_cgroup_setup_root(root); @@ -2292,10 +2316,8 @@ static void cgroup_kill_sb(struct super_block *sb) * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - cgroup_bpf_offline(&root->cgrp); + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); - } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -2316,7 +2338,7 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, @@ -3654,12 +3676,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v) static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; + struct cgroup_subsys_state *css; + int dying_cnt[CGROUP_SUBSYS_COUNT]; + int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); + + /* + * Show the number of live and dying csses associated with each of + * non-inhibited cgroup subsystems that is bound to cgroup v2. + * + * Without proper lock protection, racing is possible. So the + * numbers may not be consistent when that happens. + */ + rcu_read_lock(); + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + dying_cnt[ssid] = -1; + if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || + (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) + continue; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; + seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, + css ? (css->nr_descendants + 1) : 0); + } + seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + if (dying_cnt[ssid] >= 0) + seq_printf(seq, "nr_dying_subsys_%s %d\n", + cgroup_subsys[ssid]->name, dying_cnt[ssid]); + } + rcu_read_unlock(); return 0; } @@ -3963,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -3979,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) @@ -4081,7 +4127,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - - * cgroup.procs and cgroup.subtree_control. + * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && @@ -4580,8 +4626,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @root are accessible and @pos is a descendant of @root. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -4629,8 +4676,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct rightmost descendant as - * long as @pos is accessible. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct rightmost descendant as long as @pos + * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) @@ -4674,9 +4722,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @cgroup are accessible and @pos is a descendant of - * @cgroup. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -5368,7 +5416,8 @@ static void css_free_rwork_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + if (!cgroup_on_dfl(cgrp)) + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); @@ -5408,6 +5457,8 @@ static void css_release_work_fn(struct work_struct *work) list_del_rcu(&css->sibling); if (ss) { + struct cgroup *parent_cgrp; + /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); @@ -5417,6 +5468,21 @@ static void css_release_work_fn(struct work_struct *work) cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); + + cgrp->nr_dying_subsys[ss->id]--; + /* + * When a css is released and ready to be freed, its + * nr_descendants must be zero. However, the corresponding + * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem + * is activated and deactivated multiple times with one or + * more of its previous activation leaving behind dying csses. + */ + WARN_ON_ONCE(css->nr_descendants); + parent_cgrp = cgroup_parent(cgrp); + while (parent_cgrp) { + parent_cgrp->nr_dying_subsys[ss->id]--; + parent_cgrp = cgroup_parent(parent_cgrp); + } } else { struct cgroup *tcgrp; @@ -5501,8 +5567,11 @@ static int online_css(struct cgroup_subsys_state *css) rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); - if (css->parent) + if (css->parent) { atomic_inc(&css->parent->online_cnt); + while ((css = css->parent)) + css->nr_descendants++; + } } return ret; } @@ -5524,6 +5593,16 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** @@ -5627,9 +5706,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_kernfs_remove; - ret = cgroup_bpf_inherit(cgrp); - if (ret) - goto out_psi_free; + if (cgrp->root == &cgrp_dfl_root) { + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + } /* * New cgroup inherits effective freeze counter, and @@ -5706,7 +5787,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; - int level = 1; + int level = 0; lockdep_assert_held(&cgroup_mutex); @@ -5714,7 +5795,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; - if (level > cgroup->max_depth) + if (level >= cgroup->max_depth) goto fail; level++; @@ -5943,7 +6024,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); - cgroup_bpf_offline(cgrp); + if (cgrp->root == &cgrp_dfl_root) + cgroup_bpf_offline(cgrp); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6162,7 +6244,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif @@ -6393,7 +6475,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; - struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); @@ -6403,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6410,14 +6495,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) return 0; } - f = fget_raw(kargs->cgroup); - if (!f) { + CLASS(fd_raw, f)(kargs->cgroup); + if (fd_empty(f)) { ret = -EBADF; goto err; } - sb = f->f_path.dentry->d_sb; + sb = fd_file(f)->f_path.dentry->d_sb; - dst_cgrp = cgroup_get_from_file(f); + dst_cgrp = cgroup_get_from_file(fd_file(f)); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; @@ -6465,15 +6550,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) } put_css_set(cset); - fput(f); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); - if (f) - fput(f); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); @@ -6586,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6599,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6637,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. */ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); @@ -6685,8 +6771,10 @@ void cgroup_exit(struct task_struct *tsk) WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + /* matches the signal->live check in css_task_iter_advance() */ + if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) + list_add_tail(&tsk->cg_list, &cset->dying_tasks); if (dl_task(tsk)) dec_dl_tasks_cs(tsk); @@ -6713,10 +6801,12 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); + if (!list_empty(&task->cg_list)) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) @@ -6879,14 +6969,11 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { - struct cgroup *cgrp; - struct fd f = fdget_raw(fd); - if (!f.file) + CLASS(fd_raw, f)(fd); + if (fd_empty(f)) return ERR_PTR(-EBADF); - cgrp = cgroup_v1v2_get_from_file(f.file); - fdput(f); - return cgrp; + return cgroup_v1v2_get_from_file(fd_file(f)); } /** @@ -7061,7 +7148,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n" - "memory_hugetlb_accounting\n"); + "memory_hugetlb_accounting\n" + "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h new file mode 100644 index 000000000000..976a8bc3ff60 --- /dev/null +++ b/kernel/cgroup/cpuset-internal.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __CPUSET_INTERNAL_H +#define __CPUSET_INTERNAL_H + +#include <linux/cgroup.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/spinlock.h> +#include <linux/union_find.h> + +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time64_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + +/* + * Invalid partition error code + */ +enum prs_errcode { + PERR_NONE = 0, + PERR_INVCPUS, + PERR_INVPARENT, + PERR_NOTPART, + PERR_NOTEXCL, + PERR_NOCPUS, + PERR_HOTPLUG, + PERR_CPUSEMPTY, + PERR_HKEEPING, + PERR_ACCESS, +}; + +/* bits in struct cpuset flags field */ +typedef enum { + CS_ONLINE, + CS_CPU_EXCLUSIVE, + CS_MEM_EXCLUSIVE, + CS_MEM_HARDWALL, + CS_MEMORY_MIGRATE, + CS_SCHED_LOAD_BALANCE, + CS_SPREAD_PAGE, + CS_SPREAD_SLAB, +} cpuset_flagbits_t; + +/* The various types of files and directories in a cpuset file system */ + +typedef enum { + FILE_MEMORY_MIGRATE, + FILE_CPULIST, + FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, + FILE_SUBPARTS_CPULIST, + FILE_EXCLUSIVE_CPULIST, + FILE_EFFECTIVE_XCPULIST, + FILE_ISOLATED_CPULIST, + FILE_CPU_EXCLUSIVE, + FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, + FILE_SCHED_LOAD_BALANCE, + FILE_PARTITION_ROOT, + FILE_SCHED_RELAX_DOMAIN_LEVEL, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, +} cpuset_filetype_t; + +struct cpuset { + struct cgroup_subsys_state css; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierarchy: + * + * The user-configured masks are always the same with effective masks. + */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; + + /* + * Exclusive CPUs dedicated to current cgroup (default hierarchy only) + * + * The effective_cpus of a valid partition root comes solely from its + * effective_xcpus and some of the effective_xcpus may be distributed + * to sub-partitions below & hence excluded from its effective_cpus. + * For a valid partition root, its effective_cpus have no relationship + * with cpus_allowed unless its exclusive_cpus isn't set. + * + * This value will only be set if either exclusive_cpus is set or + * when this cpuset becomes a local partition root. + */ + cpumask_var_t effective_xcpus; + + /* + * Exclusive CPUs as requested by the user (default hierarchy only) + * + * Its value is independent of cpus_allowed and designates the set of + * CPUs that can be granted to the current cpuset or its children when + * it becomes a valid partition root. The effective set of exclusive + * CPUs granted (effective_xcpus) depends on whether those exclusive + * CPUs are passed down by its ancestors and not yet taken up by + * another sibling partition root along the way. + * + * If its value isn't set, it defaults to cpus_allowed. + */ + cpumask_var_t exclusive_cpus; + + /* + * This is old Memory Nodes tasks took on. + * + * - top_cpuset.old_mems_allowed is initialized to mems_allowed. + * - A new cpuset's old_mems_allowed is initialized when some + * task is moved into it. + * - old_mems_allowed is used in cpuset_migrate_mm() when we change + * cpuset.mems_allowed and have tasks' nodemask updated, and + * then old_mems_allowed is updated to mems_allowed. + */ + nodemask_t old_mems_allowed; + + struct fmeter fmeter; /* memory_pressure filter */ + + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + + /* for custom sched domain */ + int relax_domain_level; + + /* number of valid local child partitions */ + int nr_subparts; + + /* partition root state */ + int partition_root_state; + + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; + + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; + + /* Used to merge intersecting subsets for generate_sched_domains */ + struct uf_node node; +}; + +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuset, css) : NULL; +} + +/* Retrieve the cpuset for a task */ +static inline struct cpuset *task_cs(struct task_struct *task) +{ + return css_cs(task_css(task, cpuset_cgrp_id)); +} + +static inline struct cpuset *parent_cs(struct cpuset *cs) +{ + return css_cs(cs->css.parent); +} + +/* convenient tests for these bits */ +static inline bool is_cpuset_online(struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); +} + +static inline int is_cpu_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + +static inline int is_sched_load_balance(const struct cpuset *cs) +{ + return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); +} + +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); +} + +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_css: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) + +void rebuild_sched_domains_locked(void); +void cpuset_callback_lock_irq(void); +void cpuset_callback_unlock_irq(void); +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus); +void cpuset_update_tasks_nodemask(struct cpuset *cs); +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int cpuset_common_seq_show(struct seq_file *sf, void *v); + +/* + * cpuset-v1.c + */ +#ifdef CONFIG_CPUSETS_V1 +extern struct cftype cpuset1_files[]; +void fmeter_init(struct fmeter *fmp); +void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk); +void cpuset1_update_tasks_flags(struct cpuset *cs); +void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated); +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +#else +static inline void fmeter_init(struct fmeter *fmp) {} +static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) {} +static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} +static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) {} +static inline int cpuset1_validate_change(struct cpuset *cur, + struct cpuset *trial) { return 0; } +#endif /* CONFIG_CPUSETS_V1 */ + +#endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c new file mode 100644 index 000000000000..25c1d7b77e2f --- /dev/null +++ b/kernel/cgroup/cpuset-v1.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cpuset-internal.h" + +/* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + +/* + * Frequency meter - How fast is some event occurring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. + */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled __read_mostly; + +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + */ + +void __cpuset_memory_pressure_bump(void) +{ + rcu_read_lock(); + fmeter_markevent(&task_cs(current)->fmeter); + rcu_read_unlock(); +} + +static int update_relax_domain_level(struct cpuset *cs, s64 val) +{ +#ifdef CONFIG_SMP + if (val < -1 || val > sched_domain_level_max + 1) + return -EINVAL; +#endif + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) + rebuild_sched_domains_locked(); + } + + return 0; +} + +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = -ENODEV; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) + goto out_unlock; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +/* + * update task's spread flag if cpuset's page/slab spread flag is set + * + * Call with callback_lock or cpuset_mutex held. The check can be skipped + * if on default hierarchy. + */ +void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) +{ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + if (is_spread_page(cs)) + task_set_spread_page(tsk); + else + task_clear_spread_page(tsk); + + if (is_spread_slab(cs)) + task_set_spread_slab(tsk); + else + task_clear_spread_slab(tsk); +} + +/** + * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset. + * @cs: the cpuset in which each task's spread flags needs to be changed + * + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. + */ +void cpuset1_update_tasks_flags(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + cpuset1_update_task_spread_flags(cs, task); + css_task_iter_end(&it); +} + +/* + * If CPU and/or memory hotplug handlers, below, unplug any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). + */ + parent = parent_cs(cs); + while (cpumask_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent_cs(parent); + + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); + } +} + +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + +void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + cpuset_callback_lock_irq(); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + cpuset_callback_unlock_irq(); + + /* + * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migrated to an ancestor. + */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + cpuset_update_tasks_cpumask(cs, new_cpus); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + cpuset_update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Execute it asynchronously using workqueue. + */ + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); + } +} + +/* + * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags + * are only set if the other's are set. Call holding cpuset_mutex. + */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/* + * cpuset1_validate_change() - Validate conditions specific to legacy (v1) + * behavior. + */ +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = 0; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; + } + + switch (type) { + case FILE_CPU_EXCLUSIVE: + retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); + break; + case FILE_MEM_EXCLUSIVE: + retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); + break; + case FILE_SCHED_LOAD_BALANCE: + retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + break; + case FILE_MEMORY_MIGRATE: + retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + cpuset_memory_pressure_enabled = !!val; + break; + case FILE_SPREAD_PAGE: + retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); + break; + case FILE_SPREAD_SLAB: + retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +/* + * for the common functions, 'private' gives the type of file + */ + +struct cftype cpuset1_files[] = { + { + .name = "cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + }, + + { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + /* obsolete, may be removed in the future */ + .name = "memory_spread_slab", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, + + { + .name = "memory_pressure_enabled", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE_ENABLED, + }, + + { } /* terminate */ +}; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 927bef3a598a..0f910c828973 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,11 +21,9 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ +#include "cgroup-internal.h" +#include "cpuset-internal.h" -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/cpuset.h> -#include <linux/delay.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> @@ -39,10 +37,8 @@ #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/security.h> -#include <linux/spinlock.h> #include <linux/oom.h> #include <linux/sched/isolation.h> -#include <linux/cgroup.h> #include <linux/wait.h> #include <linux/workqueue.h> @@ -56,30 +52,6 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -/* - * Invalid partition error code - */ -enum prs_errcode { - PERR_NONE = 0, - PERR_INVCPUS, - PERR_INVPARENT, - PERR_NOTPART, - PERR_NOTEXCL, - PERR_NOCPUS, - PERR_HOTPLUG, - PERR_CPUSEMPTY, - PERR_HKEEPING, -}; - static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", @@ -87,118 +59,9 @@ static const char * const perr_strings[] = { [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", - [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", -}; - -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierarchy: - * - * The user-configured masks are always the same with effective masks. - */ - - /* user-configured CPUs and Memory Nodes allow to tasks */ - cpumask_var_t cpus_allowed; - nodemask_t mems_allowed; - - /* effective CPUs and Memory Nodes allow to tasks */ - cpumask_var_t effective_cpus; - nodemask_t effective_mems; - - /* - * Exclusive CPUs dedicated to current cgroup (default hierarchy only) - * - * This exclusive CPUs must be a subset of cpus_allowed. A parent - * cgroup can only grant exclusive CPUs to one of its children. - * - * When the cgroup becomes a valid partition root, effective_xcpus - * defaults to cpus_allowed if not set. The effective_cpus of a valid - * partition root comes solely from its effective_xcpus and some of the - * effective_xcpus may be distributed to sub-partitions below & hence - * excluded from its effective_cpus. - */ - cpumask_var_t effective_xcpus; - - /* - * Exclusive CPUs as requested by the user (default hierarchy only) - */ - cpumask_var_t exclusive_cpus; - - /* - * This is old Memory Nodes tasks took on. - * - * - top_cpuset.old_mems_allowed is initialized to mems_allowed. - * - A new cpuset's old_mems_allowed is initialized when some - * task is moved into it. - * - old_mems_allowed is used in cpuset_migrate_mm() when we change - * cpuset.mems_allowed and have tasks' nodemask updated, and - * then old_mems_allowed is updated to mems_allowed. - */ - nodemask_t old_mems_allowed; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* - * Tasks are being attached to this cpuset. Used to prevent - * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). - */ - int attach_in_progress; - - /* partition number for rebuild_sched_domains() */ - int pn; - - /* for custom sched domain */ - int relax_domain_level; - - /* number of valid sub-partitions */ - int nr_subparts; - - /* partition root state */ - int partition_root_state; - - /* - * Default hierarchy only: - * use_parent_ecpus - set if using parent's effective_cpus - * child_ecpus_count - # of children with use_parent_ecpus set - */ - int use_parent_ecpus; - int child_ecpus_count; - - /* - * number of SCHED_DEADLINE tasks attached to this cpuset, so that we - * know when to rebuild associated root domain bandwidth information. - */ - int nr_deadline_tasks; - int nr_migrate_dl_tasks; - u64 sum_migrate_dl_bw; - - /* Invalid partition error code, not lock protected */ - enum prs_errcode prs_err; - - /* Handle for cpuset.cpus.partition */ - struct cgroup_file partition_file; - - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; + [PERR_ACCESS] = "Enable partition not permitted", }; /* @@ -211,10 +74,33 @@ static cpumask_var_t subpartitions_cpus; */ static cpumask_var_t isolated_cpus; +/* + * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot + */ +static cpumask_var_t boot_hk_cpus; +static bool have_boot_isolcpus; + /* List of remote partition root children */ static struct list_head remote_children; /* + * A flag to force sched domain rebuild at the end of an operation. + * It can be set in + * - update_partition_sd_lb() + * - remote_partition_check() + * - update_cpumasks_hier() + * - cpuset_update_flag() + * - cpuset_hotplug_update_tasks() + * - cpuset_handle_hotplug() + * + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * + * Note that update_relax_domain_level() in cpuset-v1.c can still call + * rebuild_sched_domains_locked() directly without using this flag. + */ +static bool force_sd_rebuild; + +/* * Partition root states: * * 0 - member (not a partition root) @@ -222,6 +108,17 @@ static struct list_head remote_children; * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root + * + * There are 2 types of partitions - local or remote. Local partitions are + * those whose parents are partition root themselves. Setting of + * cpuset.cpus.exclusive are optional in setting up local partitions. + * Remote partitions are those whose parents are not partition roots. Passing + * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor + * nodes are mandatory in creating a remote partition. + * + * For simplicity, a local partition can be created under a local or remote + * partition but a remote partition cannot have any partition root in its + * ancestor chain except the cgroup root. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -243,22 +140,6 @@ struct tmpmasks { cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); @@ -273,59 +154,6 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline bool is_cpuset_online(struct cpuset *cs) -{ - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); -} - -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} - -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; @@ -360,46 +188,17 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), + .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through the online children of @parent_cs. Must be used - * with RCU read locked. - */ -#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ - css_for_each_child((pos_css), &(parent_cs)->css) \ - if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) - -/** - * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants - * @des_cs: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @root_cs: target cpuset to walk ancestor of - * - * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. @root_cs is included in the - * iteration and the first node to be visited. - */ -#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ - css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ - if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) - /* * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. We also require taking task_lock() when dereferencing a - * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems - * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset + * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel + * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset * structures. Note that cpuset_mutex needs to be a mutex as it is used in * paths that rely on priority inheritance (e.g. scheduler - on RT) for * correctness. @@ -425,12 +224,9 @@ static struct cpuset top_cpuset = { * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_lock across + * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. - * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c */ static DEFINE_MUTEX(cpuset_mutex); @@ -447,13 +243,17 @@ void cpuset_unlock(void) static DEFINE_SPINLOCK(callback_lock); -static struct workqueue_struct *cpuset_migrate_mm_wq; +void cpuset_callback_lock_irq(void) +{ + spin_lock_irq(&callback_lock); +} -/* - * CPU / memory hotplug is handled asynchronously. - */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +void cpuset_callback_unlock_irq(void) +{ + spin_unlock_irq(&callback_lock); +} + +static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -469,6 +269,32 @@ static inline void check_insane_mems_config(nodemask_t *nodes) } /* + * decrease cs->attach_in_progress. + * wake_up cpuset_attach_wq if cs->attach_in_progress==0. + */ +static inline void dec_attach_in_progress_locked(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +} + +static inline void dec_attach_in_progress(struct cpuset *cs) +{ + mutex_lock(&cpuset_mutex); + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); +} + +static inline bool cpuset_v2(void) +{ + return !IS_ENABLED(CONFIG_CPUSETS_V1) || + cgroup_subsys_on_dfl(cpuset_cgrp_subsys); +} + +/* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. @@ -478,7 +304,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline bool is_in_v2_mode(void) { - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } @@ -540,22 +366,10 @@ static void guarantee_online_cpus(struct task_struct *tsk, rcu_read_lock(); cs = task_cs(tsk); - while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus); -out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } @@ -577,45 +391,6 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } -/* - * update task's spread flag if cpuset's page/slab spread flag is set - * - * Call with callback_lock or cpuset_mutex held. The check can be skipped - * if on default hierarchy. - */ -static void cpuset_update_task_spread_flags(struct cpuset *cs, - struct task_struct *tsk) -{ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) - return; - - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} - -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. - */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); -} - /** * alloc_cpumasks - allocate three cpumasks for cpuset * @cs: the cpuset that have cpumasks to be allocated. @@ -718,11 +493,17 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } -static inline struct cpumask *fetch_xcpus(struct cpuset *cs) +/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{ + return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed + : cs->exclusive_cpus; +} + +static inline bool xcpus_empty(struct cpuset *cs) { - return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : - cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed - : cs->effective_xcpus; + return cpumask_empty(cs->cpus_allowed) && + cpumask_empty(cs->exclusive_cpus); } /* @@ -732,8 +513,8 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs) */ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { - struct cpumask *xcpus1 = fetch_xcpus(cs1); - struct cpumask *xcpus2 = fetch_xcpus(cs2); + struct cpumask *xcpus1 = user_xcpus(cs1); + struct cpumask *xcpus2 = user_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) return false; @@ -741,35 +522,6 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) } /* - * validate_change_legacy() - Validate conditions specific to legacy (v1) - * behavior. - */ -static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) -{ - struct cgroup_subsys_state *css; - struct cpuset *c, *par; - int ret; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - par = parent_cs(cur); - if (par && !is_cpuset_subset(trial, par)) - goto out; - - ret = 0; -out: - return ret; -} - -/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -798,7 +550,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) rcu_read_lock(); if (!is_in_v2_mode()) - ret = validate_change_legacy(cur, trial); + ret = cpuset1_validate_change(cur, trial); if (ret) goto out; @@ -824,27 +576,63 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* * We can't shrink if we won't have enough room for SCHED_DEADLINE - * tasks. + * tasks. This check is not done when scheduling is disabled as the + * users should know what they are doing. + * + * For v1, effective_cpus == cpus_allowed & user_xcpus() returns + * cpus_allowed. + * + * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only + * for non-isolated partition root. At this point, the target + * effective_cpus isn't computed yet. user_xcpus() is the best + * approximation. + * + * TBD: May need to precompute the real effective_cpus here in case + * incorrect scheduling of SCHED_DEADLINE tasks in a partition + * becomes an issue. */ ret = -EBUSY; - if (is_cpu_exclusive(cur) && - !cpuset_cpumask_can_shrink(cur->cpus_allowed, - trial->cpus_allowed)) + if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && + !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) goto out; /* * If either I or some sibling (!= me) is exclusive, we can't - * overlap + * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur) { + bool txset, cxset; /* Are exclusive_cpus set? */ + + if (c == cur) + continue; + + txset = !cpumask_empty(trial->exclusive_cpus); + cxset = !cpumask_empty(c->exclusive_cpus); + if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || + (txset && cxset)) { if (!cpusets_are_exclusive(trial, c)) goto out; + } else if (txset || cxset) { + struct cpumask *xcpus, *acpus; + + /* + * When just one of the exclusive_cpus's is set, + * cpus_allowed of the other cpuset, if set, cannot be + * a subset of it or none of those CPUs will be + * available if these exclusive CPUs are activated. + */ + if (txset) { + xcpus = trial->exclusive_cpus; + acpus = c->cpus_allowed; + } else { + xcpus = c->exclusive_cpus; + acpus = trial->cpus_allowed; + } + if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) + goto out; } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) goto out; } @@ -940,18 +728,15 @@ static inline int nr_cpusets(void) * were changed (added or removed.) * * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) @@ -959,20 +744,23 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ + int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); + bool cgrpv2 = cpuset_v2(); + int nslot_update; doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts) { + if (root_load_balance && cpumask_empty(subpartitions_cpus)) { +single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -1000,16 +788,18 @@ static int generate_sched_domains(cpumask_var_t **domains, cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; + + if (cgrpv2) + goto v2; + /* + * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. - * - * If root is load-balancing, we can skip @cp if it - * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && @@ -1017,47 +807,62 @@ static int generate_sched_domains(cpumask_var_t **domains, housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; - if (root_load_balance && - cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) - continue; - if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; - /* skip @cp's subtree if not a partition root */ - if (!is_partition_valid(cp)) + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + +v2: + /* + * Only valid partition roots that are not isolated and with + * non-empty effective_cpus will be saved into csn[]. + */ + if ((cp->partition_root_state == PRS_ROOT) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* + * Skip @cp's subtree if not a partition root and has no + * exclusive CPUs to be granted to child cpusets. + */ + if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); + /* + * If there are only isolated partitions underneath the cgroup root, + * we can optimize out unneeded sched domains scanning. + */ + if (root_load_balance && (csn == 1)) + goto single_root_domain; + for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; + uf_node_init(&csa[i]->node); -restart: - /* Find the best partition (set of sched domains) */ + /* Merge overlapping cpusets */ for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) { + /* + * Cgroup v2 shouldn't pass down overlapping + * partition root cpusets. + */ + WARN_ON_ONCE(cgrpv2); + uf_union(&csa[i]->node, &csa[j]->node); } } } + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. @@ -1073,45 +878,48 @@ restart: dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); - for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + /* + * Cgroup v2 doesn't support domain attributes, just set all of them + * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a + * subset of HK_TYPE_DOMAIN housekeeping CPUs. + */ + if (cgrpv2) { + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (csa[i] == &top_cpuset) + cpumask_and(doms[i], csa[i]->effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } + goto done; + } - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; + update_domain_attr_tree(dattr + nslot, csa[j]); } } - nslot++; + if (nslot_update) + nslot++; } BUG_ON(nslot != ndoms); @@ -1203,7 +1011,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; @@ -1213,11 +1021,12 @@ static void rebuild_sched_domains_locked(void) lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); + force_sd_rebuild = false; /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset @@ -1232,7 +1041,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts) { + if (!cpumask_empty(subpartitions_cpus)) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -1255,22 +1064,27 @@ static void rebuild_sched_domains_locked(void) partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ -void rebuild_sched_domains(void) +static void rebuild_sched_domains_cpuslocked(void) { - cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); +} + +void rebuild_sched_domains(void) +{ + cpus_read_lock(); + rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } /** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * @@ -1280,7 +1094,7 @@ void rebuild_sched_domains(void) * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. */ -static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1330,8 +1144,6 @@ enum partition_cmd { partcmd_invalidate, /* Make partition invalid */ }; -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on); static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); @@ -1342,14 +1154,14 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, */ static int update_partition_exclusive(struct cpuset *cs, int new_prs) { - bool exclusive = (new_prs > 0); + bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { - if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } @@ -1385,7 +1197,7 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) } if (rebuild_domains) - rebuild_sched_domains_locked(); + cpuset_force_rebuild(); } /* @@ -1407,7 +1219,7 @@ static void reset_partition_data(struct cpuset *cs) { struct cpuset *parent = parent_cs(cs); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (!cpuset_v2()) return; lockdep_assert_held(&callback_lock); @@ -1418,12 +1230,8 @@ static void reset_partition_data(struct cpuset *cs) if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } - if (!cpumask_and(cs->effective_cpus, - parent->effective_cpus, cs->cpus_allowed)) { - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; + if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) cpumask_copy(cs->effective_cpus, parent->effective_cpus); - } } /* @@ -1536,7 +1344,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * Return: true if xcpus is not empty, false otherwise. * * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of cpus_allowed and parent's effective_xcpus. + * it must be a subset of parent's effective_xcpus. */ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, struct cpumask *xcpus) @@ -1546,12 +1354,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, if (!xcpus) xcpus = cs->effective_xcpus; - if (!cpumask_empty(cs->exclusive_cpus)) - cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); - else - cpumask_copy(xcpus, cs->cpus_allowed); - - return cpumask_and(xcpus, xcpus, parent->effective_xcpus); + return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); } static inline bool is_remote_partition(struct cpuset *cs) @@ -1568,8 +1371,8 @@ static inline bool is_local_partition(struct cpuset *cs) * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update * @new_prs: new partition_root_state - * @tmp: temparary masks - * Return: 1 if successful, 0 if error + * @tmp: temporary masks + * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. @@ -1583,7 +1386,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The user must have sysadmin privilege. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return PERR_ACCESS; /* * The requested exclusive_cpus must not be allocated to other @@ -1597,32 +1400,26 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, if (cpumask_empty(tmp->new_cpus) || cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) - return 0; + return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; + return 0; } /* * remote_partition_disable - Remove current cpuset from remote partition list * @cs: the cpuset to update - * @tmp: temparary masks + * @tmp: temporary masks * * The effective_cpus is also updated. * @@ -1648,9 +1445,9 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) update_unbound_workqueue_cpumask(isolcpus_updated); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } @@ -1658,7 +1455,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated * @newmask: the new effective_xcpus mask - * @tmp: temparary masks + * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. @@ -1700,9 +1497,9 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, update_unbound_workqueue_cpumask(isolcpus_updated); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; @@ -1715,7 +1512,7 @@ invalidate: * @cs: the cpuset to be updated * @newmask: the new effective_xcpus mask * @delmask: temporary mask for deletion (not in tmp) - * @tmp: temparary masks + * @tmp: temporary masks * * This should be called before the given cs has updated its cpus_allowed * and/or effective_xcpus. @@ -1748,7 +1545,7 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, disable_cnt++; } if (disable_cnt) - rebuild_sched_domains_locked(); + cpuset_force_rebuild(); } /* @@ -1757,15 +1554,15 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * - * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in - * an isolated partition. + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { - const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); - bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + if (!have_boot_isolcpus) + return false; - if (!all_in_hk && (prstate != PRS_ISOLATED)) + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) return true; return false; @@ -1830,8 +1627,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = !cpumask_empty(cs->exclusive_cpus) - ? cs->effective_xcpus : cs->cpus_allowed; + xcpus = user_xcpus(cs); if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1859,7 +1655,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } - if (!newmask && cpumask_empty(cs->cpus_allowed)) + if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY; nocpu = tasks_nocpu_error(parent, cs, xcpus); @@ -1906,6 +1702,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_CPUSEMPTY; goto write_error; } + /* Check newmask again, whether cpus are available for parent/cs */ + nocpu |= tasks_nocpu_error(parent, cs, newmask); /* * partcmd_update with newmask: @@ -2073,20 +1871,17 @@ write_error: update_partition_exclusive(cs, new_prs); if (adding || deleting) { - update_tasks_cpumask(parent, tmp->addmask); + cpuset_update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } /* * For partcmd_update without newmask, it is being called from - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. - * Update the load balance flag and scheduling domain if - * cpus_read_trylock() is successful. + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. */ - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); - cpus_read_unlock(); - } notify_partition_change(cs, old_prs); return 0; @@ -2160,12 +1955,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, } /* - * update_cpumasks_hier() flags - */ -#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ -#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ - -/* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup @@ -2179,7 +1968,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, - int flags) + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -2234,17 +2023,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (!cp->use_parent_ecpus) { - cp->use_parent_ecpus = true; - parent->child_ecpus_count++; - } - } else if (cp->use_parent_ecpus) { - cp->use_parent_ecpus = false; - WARN_ON_ONCE(!parent->child_ecpus_count); - parent->child_ecpus_count--; - } if (remote) goto get_css; @@ -2253,12 +2033,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, - * 3) HIER_CHECKALL flag not set, and + * 3) force flag not set, and * 4) for v2 load balance state same as its parent. */ - if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && + if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (!cpuset_v2() || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2268,7 +2048,7 @@ update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call - * update_tasks_cpumask() again for tasks in the parent + * cpuset_update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { @@ -2325,15 +2105,14 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE * from parent if current cpuset isn't a valid partition root * and their load balance states differ. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_partition_valid(cp) && + if (cpuset_v2() && !is_partition_valid(cp) && (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { if (is_sched_load_balance(parent)) set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); @@ -2349,8 +2128,7 @@ get_css: */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_valid(cp))) + (!cpuset_v2() || is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); @@ -2358,8 +2136,8 @@ get_css: } rcu_read_unlock(); - if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) - rebuild_sched_domains_locked(); + if (need_rebuild_sched_domains) + cpuset_force_rebuild(); } /** @@ -2380,23 +2158,19 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * - * With the addition of effective_xcpus which is a subset of - * cpus_allowed. It is possible a change in parent's effective_cpus + * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. * * The update_cpumasks_hier() function may sleep. So we have to - * release the RCU read lock before calling it. HIER_NO_SD_REBUILD - * flag is used to suppress rebuild of sched domains as the callers - * will take care of that. + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus && - !is_partition_valid(sibling)) { + if (!is_partition_valid(sibling)) { compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) @@ -2406,7 +2180,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); + update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } @@ -2426,7 +2200,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ @@ -2441,7 +2215,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (!*buf) { cpumask_clear(trialcs->cpus_allowed); - cpumask_clear(trialcs->effective_xcpus); + if (cpumask_empty(trialcs->exclusive_cpus)) + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) @@ -2452,7 +2227,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EINVAL; /* - * When exclusive_cpus isn't explicitly set, it is constrainted + * When exclusive_cpus isn't explicitly set, it is constrained * by cpus_allowed and parent's effective_xcpus. Otherwise, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. @@ -2486,12 +2261,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); - if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if ((retval == -EINVAL) && cpuset_v2()) { struct cgroup_subsys_state *css; struct cpuset *cp; @@ -2505,7 +2279,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, invalidate = true; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = fetch_xcpus(trialcs); + struct cpumask *xcpus = user_xcpus(trialcs); if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { @@ -2555,7 +2329,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* effective_cpus/effective_xcpus will be updated here */ - update_cpumasks_hier(cs, &tmp, hier_flags); + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2580,7 +2354,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; if (!*buf) { @@ -2590,8 +2364,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; - if (!is_cpu_exclusive(cs)) - set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); } /* Nothing to do if the CPUs didn't change */ @@ -2605,8 +2377,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); if (retval) @@ -2659,8 +2430,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * of the subtree when it is a valid partition root or effective_xcpus * is updated. */ - if (is_partition_valid(cs) || hier_flags) - update_cpumasks_hier(cs, &tmp, hier_flags); + if (is_partition_valid(cs) || force) + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2754,14 +2525,14 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void *cpuset_being_rebound; /** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs) +void cpuset_update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; @@ -2859,7 +2630,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); - update_tasks_nodemask(cp); + cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -2945,44 +2716,8 @@ bool current_cpuset_is_being_rebound(void) return ret; } -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays - * stable. - */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flags(cs, task); - css_task_iter_end(&it); -} - /* - * update_flag - read a 0 or a 1 in a file and update associated flag + * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared @@ -2990,7 +2725,7 @@ static void update_tasks_flags(struct cpuset *cs) * Call with cpuset_mutex held. */ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; @@ -3021,11 +2756,15 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { + if (cpuset_v2()) + cpuset_force_rebuild(); + else + rebuild_sched_domains_locked(); + } if (spread_flag_changed) - update_tasks_flags(cs); + cpuset1_update_tasks_flags(cs); out: free_cpuset(trialcs); return err; @@ -3074,24 +2813,26 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; if (!old_prs) { - enum partition_cmd cmd = (new_prs == PRS_ROOT) - ? partcmd_enable : partcmd_enablei; - /* - * cpus_allowed cannot be empty. + * cpus_allowed and exclusive_cpus cannot be both empty. */ - if (cpumask_empty(cs->cpus_allowed)) { + if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out; } - err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* - * If an attempt to become local partition root fails, - * try to become a remote partition root instead. + * If parent is valid partition, enable local partiion. + * Otherwise, enable a remote partition. */ - if (err && remote_partition_enable(cs, new_prs, &tmpmask)) - err = 0; + if (is_partition_valid(parent)) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; + + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); + } else { + err = remote_partition_enable(cs, new_prs, &tmpmask); + } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. @@ -3134,117 +2875,18 @@ out: update_unbound_workqueue_cpumask(new_xcpus_state); /* Force update if switching back to member */ - update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + update_cpumasks_hier(cs, &tmpmask, !new_prs); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); free_cpumasks(NULL, &tmpmask); return 0; } -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - static struct cpuset *cpuset_attach_old_cs; /* @@ -3301,8 +2943,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * migration permission derives from hierarchy ownership in * cgroup_procs_write_permission()). */ - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - (cpus_updated || mems_updated)) { + if (!cpuset_v2() || (cpus_updated || mems_updated)) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; @@ -3353,9 +2994,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cs = css_cs(css); mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); + dec_attach_in_progress_locked(cs); if (cs->nr_migrate_dl_tasks) { int cpu = cpumask_any(cs->effective_cpus); @@ -3391,7 +3030,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flags(cs, task); + cpuset1_update_task_spread_flags(cs, task); } static void cpuset_attach(struct cgroup_taskset *tset) @@ -3418,8 +3057,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * in effective cpus and mems. In that case, we can optimize out * by skipping the task iteration and update. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !cpus_updated && !mems_updated) { + if (cpuset_v2() && !cpus_updated && !mems_updated) { cpuset_attach_nodemask_to = cs->effective_mems; goto out; } @@ -3470,116 +3108,15 @@ out: reset_migrate_dl_data(cs); } - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - - mutex_unlock(&cpuset_mutex); -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_SUBPARTS_CPULIST, - FILE_EXCLUSIVE_CPULIST, - FILE_EFFECTIVE_XCPULIST, - FILE_ISOLATED_CPULIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_PARTITION_ROOT, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; - } + dec_attach_in_progress_locked(cs); - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; -} - -static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; } /* * Common handling for a write to a "cpus" or "mems" file. */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3587,30 +3124,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, int retval = -ENODEV; buf = strstrip(buf); - - /* - * CPU or memory hotunplug may leave @cs w/o any execution - * resources, in which case the hotplug code asynchronously updates - * configuration and transfers all tasks to the nearest ancestor - * which can execute. - * - * As writes to "cpus" or "mems" may restore @cs's execution - * resources, wait for the previously scheduled operations before - * proceeding, so that we don't end up keep removing tasks added - * after execution capability is restored. - * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs - * operation like this one can lead to a deadlock through kernfs - * active_ref protection. Let's break the protection. Losing the - * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy - * hierarchies. - */ - css_get(&cs->css); - kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); - cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) @@ -3638,11 +3151,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); - kernfs_unbreak_active_protection(of->kn); - css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -3655,7 +3168,7 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) +int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; @@ -3696,52 +3209,6 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); @@ -3782,9 +3249,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. - */ if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) @@ -3809,112 +3273,6 @@ out_unlock: } /* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype legacy_files[] = { - { - .name = "cpus", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "effective_cpus", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "effective_mems", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, - }, - - { } /* terminate */ -}; - -/* * This is currently a minimal set for the default hierarchy. It can be * expanded later on by migrating more features and control files from v1. */ @@ -4019,14 +3377,12 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) } __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - nodes_clear(cs->mems_allowed); - nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (cpuset_v2()) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; @@ -4050,6 +3406,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cpuset_v2() && !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -4057,22 +3418,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; - /* - * Clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (!is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } - - /* - * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) @@ -4133,16 +3479,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_partition_valid(cs)) update_prstate(cs, 0); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } + if (!cpuset_v2() && is_sched_load_balance(cs)) + cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); @@ -4232,11 +3570,7 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) if (same_cs) return; - mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + dec_attach_in_progress(cs); } /* @@ -4268,10 +3602,7 @@ static void cpuset_fork(struct task_struct *task) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - + dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } @@ -4288,7 +3619,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, - .legacy_cftypes = legacy_files, +#ifdef CONFIG_CPUSETS_V1 + .legacy_cftypes = cpuset1_files, +#endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, @@ -4317,78 +3650,18 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - return 0; -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). - */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); + have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); + if (have_boot_isolcpus) { + BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); + cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); } -} - -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migrated to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs, new_cpus); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - /* - * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. - */ - if (is_empty) { - mutex_unlock(&cpuset_mutex); - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); - } + return 0; } static void @@ -4408,40 +3681,14 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs, new_cpus); + cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) - update_tasks_nodemask(cs); + cpuset_update_tasks_nodemask(cs); } -static bool force_rebuild; - void cpuset_force_rebuild(void) { - force_rebuild = true; -} - -/* - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in - * progress. - * Return: true if successful, false otherwise - * - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ -static bool cpuset_hotplug_cpus_read_trylock(void) -{ - int retries = 0; - - while (!cpus_read_trylock()) { - /* - * CPU hotplug still in progress. Retry 5 times - * with a 10ms wait before bailing out. - */ - if (++retries > 5) - return false; - msleep(10); - } - return true; + force_sd_rebuild = true; } /** @@ -4492,13 +3739,11 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL) && - cpuset_hotplug_cpus_read_trylock()) { + partition_is_populated(cs, NULL)) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); - cpus_read_unlock(); } /* @@ -4518,18 +3763,8 @@ retry: else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update; - /* - * cpus_read_lock needs to be held before calling - * update_parent_effective_cpumask(). To avoid circular lock - * dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ if (partcmd >= 0) { - if (!cpuset_hotplug_cpus_read_trylock()) - goto update_tasks; - update_parent_effective_cpumask(cs, partcmd, NULL, tmp); - cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); @@ -4549,7 +3784,7 @@ update_tasks: hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else - hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); unlock: @@ -4557,8 +3792,7 @@ unlock: } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * @work: unused + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4572,8 +3806,10 @@ unlock: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -4584,6 +3820,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; + lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ @@ -4599,15 +3836,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); - /* - * In the rare case that hotplug removes all the cpus in - * subpartitions_cpus, we assumed that cpus are updated. - */ - if (!cpus_updated && top_cpuset.nr_subparts) - cpus_updated = true; - /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { + cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); @@ -4638,7 +3869,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); - update_tasks_nodemask(&top_cpuset); + cpuset_update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -4662,11 +3893,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_unlock(); } - /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); - } + /* rebuild sched domains if necessary */ + if (force_sd_rebuild) + rebuild_sched_domains_cpuslocked(); free_cpumasks(NULL, ptmp); } @@ -4678,12 +3907,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); } /* @@ -4694,7 +3918,7 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; } @@ -4983,19 +4207,6 @@ int cpuset_mem_spread_node(void) } /** - * cpuset_slab_spread_node() - On which node to begin search for a slab page - */ -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - -/** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. * @tsk2: pointer to task_struct of some other task. @@ -5033,39 +4244,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/* - * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - */ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() @@ -5088,10 +4266,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c new file mode 100644 index 000000000000..10b63433f057 --- /dev/null +++ b/kernel/cgroup/dmem.c @@ -0,0 +1,829 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>) + * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>) + * Partially based on the rdma and misc controllers, which bear the following copyrights: + * + * Copyright 2020 Google LLC + * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> + */ + +#include <linux/cgroup.h> +#include <linux/cgroup_dmem.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/page_counter.h> +#include <linux/parser.h> +#include <linux/slab.h> + +struct dmem_cgroup_region { + /** + * @ref: References keeping the region alive. + * Keeps the region reference alive after a succesful RCU lookup. + */ + struct kref ref; + + /** @rcu: RCU head for freeing */ + struct rcu_head rcu; + + /** + * @region_node: Linked into &dmem_cgroup_regions list. + * Protected by RCU and global spinlock. + */ + struct list_head region_node; + + /** + * @pools: List of pools linked to this region. + * Protected by global spinlock only + */ + struct list_head pools; + + /** @size: Size of region, in bytes */ + u64 size; + + /** @name: Name describing the node, set by dmem_cgroup_register_region */ + char *name; + + /** + * @unregistered: Whether the region is unregistered by its caller. + * No new pools should be added to the region afterwards. + */ + bool unregistered; +}; + +struct dmemcg_state { + struct cgroup_subsys_state css; + + struct list_head pools; +}; + +struct dmem_cgroup_pool_state { + struct dmem_cgroup_region *region; + struct dmemcg_state *cs; + + /* css node, RCU protected against region teardown */ + struct list_head css_node; + + /* dev node, no RCU protection required */ + struct list_head region_node; + + struct rcu_head rcu; + + struct page_counter cnt; + + bool inited; +}; + +/* + * 3 operations require locking protection: + * - Registering and unregistering region to/from list, requires global lock. + * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed. + * - Adding a dmem_cgroup_pool_state to a region list. + * + * Since for the most common operations RCU provides enough protection, I + * do not think more granular locking makes sense. Most protection is offered + * by RCU and the lockless operating page_counter. + */ +static DEFINE_SPINLOCK(dmemcg_lock); +static LIST_HEAD(dmem_cgroup_regions); + +static inline struct dmemcg_state * +css_to_dmemcs(struct cgroup_subsys_state *css) +{ + return container_of(css, struct dmemcg_state, css); +} + +static inline struct dmemcg_state *get_current_dmemcs(void) +{ + return css_to_dmemcs(task_get_css(current, dmem_cgrp_id)); +} + +static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg) +{ + return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL; +} + +static void free_cg_pool(struct dmem_cgroup_pool_state *pool) +{ + list_del(&pool->region_node); + kfree(pool); +} + +static void +set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_min(&pool->cnt, val); +} + +static void +set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_low(&pool->cnt, val); +} + +static void +set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_max(&pool->cnt, val); +} + +static u64 get_resource_low(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.low) : 0; +} + +static u64 get_resource_min(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.min) : 0; +} + +static u64 get_resource_max(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX; +} + +static u64 get_resource_current(struct dmem_cgroup_pool_state *pool) +{ + return pool ? page_counter_read(&pool->cnt) : 0; +} + +static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool) +{ + set_resource_min(rpool, 0); + set_resource_low(rpool, 0); + set_resource_max(rpool, PAGE_COUNTER_MAX); +} + +static void dmemcs_offline(struct cgroup_subsys_state *css) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(css); + struct dmem_cgroup_pool_state *pool; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &dmemcs->pools, css_node) + reset_all_resource_limits(pool); + rcu_read_unlock(); +} + +static void dmemcs_free(struct cgroup_subsys_state *css) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(css); + struct dmem_cgroup_pool_state *pool, *next; + + spin_lock(&dmemcg_lock); + list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) { + /* + *The pool is dead and all references are 0, + * no need for RCU protection with list_del_rcu or freeing. + */ + list_del(&pool->css_node); + free_cg_pool(pool); + } + spin_unlock(&dmemcg_lock); + + kfree(dmemcs); +} + +static struct cgroup_subsys_state * +dmemcs_alloc(struct cgroup_subsys_state *parent_css) +{ + struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL); + if (!dmemcs) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dmemcs->pools); + return &dmemcs->css; +} + +static struct dmem_cgroup_pool_state * +find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region) +{ + struct dmem_cgroup_pool_state *pool; + + list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock)) + if (pool->region == region) + return pool; + + return NULL; +} + +static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool) +{ + if (!pool->cnt.parent) + return NULL; + + return container_of(pool->cnt.parent, typeof(*pool), cnt); +} + +static void +dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool) +{ + struct page_counter *climit; + struct cgroup_subsys_state *css; + struct dmemcg_state *dmemcg_iter; + struct dmem_cgroup_pool_state *pool, *found_pool; + + climit = &limit_pool->cnt; + + rcu_read_lock(); + + css_for_each_descendant_pre(css, &limit_pool->cs->css) { + dmemcg_iter = container_of(css, struct dmemcg_state, css); + found_pool = NULL; + + list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { + if (pool->region == limit_pool->region) { + found_pool = pool; + break; + } + } + if (!found_pool) + continue; + + page_counter_calculate_protection( + climit, &found_pool->cnt, true); + + if (found_pool == test_pool) + break; + } + rcu_read_unlock(); +} + +/** + * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool + * @limit_pool: The pool for which we hit limits + * @test_pool: The pool for which to test + * @ignore_low: Whether we have to respect low watermarks. + * @ret_hit_low: Pointer to whether it makes sense to consider low watermark. + * + * This function returns true if we can evict from @test_pool, false if not. + * When returning false and @ignore_low is false, @ret_hit_low may + * be set to true to indicate this function can be retried with @ignore_low + * set to true. + * + * Return: bool + */ +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low) +{ + struct dmem_cgroup_pool_state *pool = test_pool; + struct page_counter *ctest; + u64 used, min, low; + + /* Can always evict from current pool, despite limits */ + if (limit_pool == test_pool) + return true; + + if (limit_pool) { + if (!parent_dmemcs(limit_pool->cs)) + return true; + + for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool)) + {} + + if (!pool) + return false; + } else { + /* + * If there is no cgroup limiting memory usage, use the root + * cgroup instead for limit calculations. + */ + for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool)) + {} + } + + ctest = &test_pool->cnt; + + dmem_cgroup_calculate_protection(limit_pool, test_pool); + + used = page_counter_read(ctest); + min = READ_ONCE(ctest->emin); + + if (used <= min) + return false; + + if (!ignore_low) { + low = READ_ONCE(ctest->elow); + if (used > low) + return true; + + *ret_hit_low = true; + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable); + +static struct dmem_cgroup_pool_state * +alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, + struct dmem_cgroup_pool_state **allocpool) +{ + struct dmemcg_state *parent = parent_dmemcs(dmemcs); + struct dmem_cgroup_pool_state *pool, *ppool = NULL; + + if (!*allocpool) { + pool = kzalloc(sizeof(*pool), GFP_NOWAIT); + if (!pool) + return ERR_PTR(-ENOMEM); + } else { + pool = *allocpool; + *allocpool = NULL; + } + + pool->region = region; + pool->cs = dmemcs; + + if (parent) + ppool = find_cg_pool_locked(parent, region); + + page_counter_init(&pool->cnt, + ppool ? &ppool->cnt : NULL, true); + reset_all_resource_limits(pool); + + list_add_tail_rcu(&pool->css_node, &dmemcs->pools); + list_add_tail(&pool->region_node, ®ion->pools); + + if (!parent) + pool->inited = true; + else + pool->inited = ppool ? ppool->inited : false; + return pool; +} + +static struct dmem_cgroup_pool_state * +get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, + struct dmem_cgroup_pool_state **allocpool) +{ + struct dmem_cgroup_pool_state *pool, *ppool, *retpool; + struct dmemcg_state *p, *pp; + + /* + * Recursively create pool, we may not initialize yet on + * recursion, this is done as a separate step. + */ + for (p = dmemcs; p; p = parent_dmemcs(p)) { + pool = find_cg_pool_locked(p, region); + if (!pool) + pool = alloc_pool_single(p, region, allocpool); + + if (IS_ERR(pool)) + return pool; + + if (p == dmemcs && pool->inited) + return pool; + + if (pool->inited) + break; + } + + retpool = pool = find_cg_pool_locked(dmemcs, region); + for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) { + if (pool->inited) + break; + + /* ppool was created if it didn't exist by above loop. */ + ppool = find_cg_pool_locked(pp, region); + + /* Fix up parent links, mark as inited. */ + pool->cnt.parent = &ppool->cnt; + pool->inited = true; + + pool = ppool; + } + + return retpool; +} + +static void dmemcg_free_rcu(struct rcu_head *rcu) +{ + struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu); + struct dmem_cgroup_pool_state *pool, *next; + + list_for_each_entry_safe(pool, next, ®ion->pools, region_node) + free_cg_pool(pool); + kfree(region->name); + kfree(region); +} + +static void dmemcg_free_region(struct kref *ref) +{ + struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref); + + call_rcu(&cgregion->rcu, dmemcg_free_rcu); +} + +/** + * dmem_cgroup_unregister_region() - Unregister a previously registered region. + * @region: The region to unregister. + * + * This function undoes dmem_cgroup_register_region. + */ +void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) +{ + struct list_head *entry; + + if (!region) + return; + + spin_lock(&dmemcg_lock); + + /* Remove from global region list */ + list_del_rcu(®ion->region_node); + + list_for_each_rcu(entry, ®ion->pools) { + struct dmem_cgroup_pool_state *pool = + container_of(entry, typeof(*pool), region_node); + + list_del_rcu(&pool->css_node); + } + + /* + * Ensure any RCU based lookups fail. Additionally, + * no new pools should be added to the dead region + * by get_cg_pool_unlocked. + */ + region->unregistered = true; + spin_unlock(&dmemcg_lock); + + kref_put(®ion->ref, dmemcg_free_region); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region); + +/** + * dmem_cgroup_register_region() - Register a regions for dev cgroup. + * @size: Size of region to register, in bytes. + * @fmt: Region parameters to register + * + * This function registers a node in the dmem cgroup with the + * name given. After calling this function, the region can be + * used for allocations. + * + * Return: NULL or a struct on success, PTR_ERR on failure. + */ +struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...) +{ + struct dmem_cgroup_region *ret; + char *region_name; + va_list ap; + + if (!size) + return NULL; + + va_start(ap, fmt); + region_name = kvasprintf(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!region_name) + return ERR_PTR(-ENOMEM); + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) { + kfree(region_name); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&ret->pools); + ret->name = region_name; + ret->size = size; + kref_init(&ret->ref); + + spin_lock(&dmemcg_lock); + list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions); + spin_unlock(&dmemcg_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_register_region); + +static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name) +{ + struct dmem_cgroup_region *region; + + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock)) + if (!strcmp(name, region->name) && + kref_get_unless_zero(®ion->ref)) + return region; + + return NULL; +} + +/** + * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state + * @pool: &dmem_cgroup_pool_state + * + * Called to drop a reference to the limiting pool returned by + * dmem_cgroup_try_charge(). + */ +void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) +{ + if (pool) + css_put(&pool->cs->css); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put); + +static struct dmem_cgroup_pool_state * +get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) +{ + struct dmem_cgroup_pool_state *pool, *allocpool = NULL; + + /* fastpath lookup? */ + rcu_read_lock(); + pool = find_cg_pool_locked(cg, region); + if (pool && !READ_ONCE(pool->inited)) + pool = NULL; + rcu_read_unlock(); + + while (!pool) { + spin_lock(&dmemcg_lock); + if (!region->unregistered) + pool = get_cg_pool_locked(cg, region, &allocpool); + else + pool = ERR_PTR(-ENODEV); + spin_unlock(&dmemcg_lock); + + if (pool == ERR_PTR(-ENOMEM)) { + pool = NULL; + if (WARN_ON(allocpool)) + continue; + + allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL); + if (allocpool) { + pool = NULL; + continue; + } + } + } + + kfree(allocpool); + return pool; +} + +/** + * dmem_cgroup_uncharge() - Uncharge a pool. + * @pool: Pool to uncharge. + * @size: Size to uncharge. + * + * Undoes the effects of dmem_cgroup_try_charge. + * Must be called with the returned pool as argument, + * and same @index and @size. + */ +void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) +{ + if (!pool) + return; + + page_counter_uncharge(&pool->cnt, size); + css_put(&pool->cs->css); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge); + +/** + * dmem_cgroup_try_charge() - Try charging a new allocation to a region. + * @region: dmem region to charge + * @size: Size (in bytes) to charge. + * @ret_pool: On succesfull allocation, the pool that is charged. + * @ret_limit_pool: On a failed allocation, the limiting pool. + * + * This function charges the @region region for a size of @size bytes. + * + * If the function succeeds, @ret_pool is set, which must be passed to + * dmem_cgroup_uncharge() when undoing the allocation. + * + * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it + * will be set to the pool for which the limit is hit. This can be used for + * eviction as argument to dmem_cgroup_evict_valuable(). This reference must be freed + * with @dmem_cgroup_pool_state_put(). + * + * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure. + */ +int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool) +{ + struct dmemcg_state *cg; + struct dmem_cgroup_pool_state *pool; + struct page_counter *fail; + int ret; + + *ret_pool = NULL; + if (ret_limit_pool) + *ret_limit_pool = NULL; + + /* + * hold on to css, as cgroup can be removed but resource + * accounting happens on css. + */ + cg = get_current_dmemcs(); + + pool = get_cg_pool_unlocked(cg, region); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err; + } + + if (!page_counter_try_charge(&pool->cnt, size, &fail)) { + if (ret_limit_pool) { + *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt); + css_get(&(*ret_limit_pool)->cs->css); + } + ret = -EAGAIN; + goto err; + } + + /* On success, reference from get_current_dmemcs is transferred to *ret_pool */ + *ret_pool = pool; + return 0; + +err: + css_put(&cg->css); + return ret; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge); + +static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v) +{ + struct dmem_cgroup_region *region; + + rcu_read_lock(); + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { + seq_puts(sf, region->name); + seq_printf(sf, " %llu\n", region->size); + } + rcu_read_unlock(); + return 0; +} + +static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region, + u64 *new_limit) +{ + char *end; + + if (!strcmp(options, "max")) { + *new_limit = PAGE_COUNTER_MAX; + return 0; + } + + *new_limit = memparse(options, &end); + if (*end != '\0') + return -EINVAL; + + return 0; +} + +static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + void (*apply)(struct dmem_cgroup_pool_state *, u64)) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of)); + int err = 0; + + while (buf && !err) { + struct dmem_cgroup_pool_state *pool = NULL; + char *options, *region_name; + struct dmem_cgroup_region *region; + u64 new_limit; + + options = buf; + buf = strchr(buf, '\n'); + if (buf) + *buf++ = '\0'; + + options = strstrip(options); + + /* eat empty lines */ + if (!options[0]) + continue; + + region_name = strsep(&options, " \t"); + if (!region_name[0]) + continue; + + rcu_read_lock(); + region = dmemcg_get_region_by_name(region_name); + rcu_read_unlock(); + + if (!region) + return -EINVAL; + + err = dmemcg_parse_limit(options, region, &new_limit); + if (err < 0) + goto out_put; + + pool = get_cg_pool_unlocked(dmemcs, region); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + goto out_put; + } + + /* And commit */ + apply(pool, new_limit); + +out_put: + kref_put(®ion->ref, dmemcg_free_region); + } + + + return err ?: nbytes; +} + +static int dmemcg_limit_show(struct seq_file *sf, void *v, + u64 (*fn)(struct dmem_cgroup_pool_state *)) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf)); + struct dmem_cgroup_region *region; + + rcu_read_lock(); + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { + struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region); + u64 val; + + seq_puts(sf, region->name); + + val = fn(pool); + if (val < PAGE_COUNTER_MAX) + seq_printf(sf, " %lld\n", val); + else + seq_puts(sf, " max\n"); + } + rcu_read_unlock(); + + return 0; +} + +static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_current); +} + +static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_min); +} + +static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min); +} + +static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_low); +} + +static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low); +} + +static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_max); +} + +static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max); +} + +static struct cftype files[] = { + { + .name = "capacity", + .seq_show = dmem_cgroup_region_capacity_show, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + { + .name = "current", + .seq_show = dmem_cgroup_region_current_show, + }, + { + .name = "min", + .write = dmem_cgroup_region_min_write, + .seq_show = dmem_cgroup_region_min_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "low", + .write = dmem_cgroup_region_low_write, + .seq_show = dmem_cgroup_region_low_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "max", + .write = dmem_cgroup_region_max_write, + .seq_show = dmem_cgroup_region_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* Zero entry terminates. */ +}; + +struct cgroup_subsys dmem_cgrp_subsys = { + .css_alloc = dmemcs_alloc, + .css_free = dmemcs_free, + .css_offline = dmemcs_offline, + .legacy_cftypes = files, + .dfl_cftypes = files, +}; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 617861a54793..bf1690a167dd 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -9,6 +9,28 @@ #include <trace/events/cgroup.h> /* + * Update CGRP_FROZEN of cgroup.flag + * Return true if flags is updated; false if flags has no change + */ +static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen) +{ + lockdep_assert_held(&css_set_lock); + + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen) + return false; + + if (frozen) + set_bit(CGRP_FROZEN, &cgrp->flags); + else + clear_bit(CGRP_FROZEN, &cgrp->flags); + + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); + return true; +} + +/* * Propagate the cgroup frozen state upwards by the cgroup tree. */ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) @@ -24,24 +46,16 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) while ((cgrp = cgroup_parent(cgrp))) { if (frozen) { cgrp->freezer.nr_frozen_descendants += desc; - if (!test_bit(CGRP_FROZEN, &cgrp->flags) && - test_bit(CGRP_FREEZE, &cgrp->flags) && - cgrp->freezer.nr_frozen_descendants == - cgrp->nr_descendants) { - set_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); - desc++; - } + if (!test_bit(CGRP_FREEZE, &cgrp->flags) || + (cgrp->freezer.nr_frozen_descendants != + cgrp->nr_descendants)) + continue; } else { cgrp->freezer.nr_frozen_descendants -= desc; - if (test_bit(CGRP_FROZEN, &cgrp->flags)) { - clear_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); - desc++; - } } + + if (cgroup_update_frozen_flag(cgrp, frozen)) + desc++; } } @@ -53,8 +67,6 @@ void cgroup_update_frozen(struct cgroup *cgrp) { bool frozen; - lockdep_assert_held(&css_set_lock); - /* * If the cgroup has to be frozen (CGRP_FREEZE bit set), * and all tasks are frozen and/or stopped, let's consider @@ -63,24 +75,9 @@ void cgroup_update_frozen(struct cgroup *cgrp) frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - if (frozen) { - /* Already there? */ - if (test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - set_bit(CGRP_FROZEN, &cgrp->flags); - } else { - /* Already there? */ - if (!test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - clear_bit(CGRP_FROZEN, &cgrp->flags); - } - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); - - /* Update the state of ancestor cgroups. */ - cgroup_propagate_frozen(cgrp, frozen); + /* If flags is updated, update the state of ancestor cgroups. */ + if (cgroup_update_frozen_flag(cgrp, frozen)) + cgroup_propagate_frozen(cgrp, frozen); } /* @@ -260,8 +257,10 @@ void cgroup_freezer_migrate_task(struct task_struct *task, void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; + struct cgroup *parent; struct cgroup *dsct; bool applied = false; + bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -282,22 +281,18 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) if (cgroup_is_dead(dsct)) continue; - if (freeze) { - dsct->freezer.e_freeze++; - /* - * Already frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 1) - continue; - } else { - dsct->freezer.e_freeze--; - /* - * Still frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 0) - continue; - - WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + /* + * e_freeze is affected by parent's e_freeze and dst's freeze. + * If old e_freeze eq new e_freeze, no change, its children + * will not be affected. So do nothing and skip the subtree + */ + old_e = dsct->freezer.e_freeze; + parent = cgroup_parent(dsct); + dsct->freezer.e_freeze = (dsct->freezer.freeze || + parent->freezer.e_freeze); + if (dsct->freezer.e_freeze == old_e) { + css = css_rightmost_descendant(css); + continue; } /* diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 66d1708042a7..074653f964c1 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) * @css: css being created * * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. + * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { @@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * - * @css is going away. Mark it dead and decrement system_freezing_count if + * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 79a3717a5803..0e26068995a6 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -121,6 +121,30 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, misc_res_name[type]); } +static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage) +{ + u64 old; + + while (true) { + old = atomic64_read(&res->watermark); + if (new_usage <= old) + break; + if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old) + break; + } +} + +static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg) +{ + atomic64_inc(&cg->res[type].events_local); + cgroup_file_notify(&cg->events_local_file); + + for (; parent_misc(cg); cg = parent_misc(cg)) { + atomic64_inc(&cg->res[type].events); + cgroup_file_notify(&cg->events_file); + } +} + /** * misc_cg_try_charge() - Try charging the misc cgroup. * @type: Misc res type to charge. @@ -159,14 +183,12 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) ret = -EBUSY; goto err_charge; } + misc_cg_update_watermark(res, new_usage); } return 0; err_charge: - for (j = i; j; j = parent_misc(j)) { - atomic64_inc(&j->res[type].events); - cgroup_file_notify(&j->events_file); - } + misc_cg_event(type, i); for (j = cg; j != i; j = parent_misc(j)) misc_cg_cancel_charge(type, j, amount); @@ -308,6 +330,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v) } /** + * misc_cg_peak_show() - Show the peak usage of the misc cgroup. + * @sf: Interface file + * @v: Arguments passed + * + * Context: Any context. + * Return: 0 to denote successful print. + */ +static int misc_cg_peak_show(struct seq_file *sf, void *v) +{ + int i; + u64 watermark; + struct misc_cg *cg = css_misc(seq_css(sf)); + + for (i = 0; i < MISC_CG_RES_TYPES; i++) { + watermark = atomic64_read(&cg->res[i].watermark); + if (READ_ONCE(misc_res_capacity[i]) || watermark) + seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark); + } + + return 0; +} + +/** * misc_cg_capacity_show() - Show the total capacity of misc res on the host. * @sf: Interface file * @v: Arguments passed @@ -331,20 +376,33 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) return 0; } -static int misc_events_show(struct seq_file *sf, void *v) +static int __misc_events_show(struct seq_file *sf, bool local) { struct misc_cg *cg = css_misc(seq_css(sf)); u64 events; int i; for (i = 0; i < MISC_CG_RES_TYPES; i++) { - events = atomic64_read(&cg->res[i].events); + if (local) + events = atomic64_read(&cg->res[i].events_local); + else + events = atomic64_read(&cg->res[i].events); if (READ_ONCE(misc_res_capacity[i]) || events) seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events); } return 0; } +static int misc_events_show(struct seq_file *sf, void *v) +{ + return __misc_events_show(sf, false); +} + +static int misc_events_local_show(struct seq_file *sf, void *v) +{ + return __misc_events_show(sf, true); +} + /* Misc cgroup interface files */ static struct cftype misc_cg_files[] = { { @@ -358,6 +416,10 @@ static struct cftype misc_cg_files[] = { .seq_show = misc_cg_current_show, }, { + .name = "peak", + .seq_show = misc_cg_peak_show, + }, + { .name = "capacity", .seq_show = misc_cg_capacity_show, .flags = CFTYPE_ONLY_ON_ROOT, @@ -368,6 +430,12 @@ static struct cftype misc_cg_files[] = { .file_offset = offsetof(struct misc_cg, events_file), .seq_show = misc_events_show, }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct misc_cg, events_local_file), + .seq_show = misc_events_local_show, + }, {} }; diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 7695e60bcb40..8f61114c36dd 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -38,6 +38,14 @@ #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" +enum pidcg_event { + /* Fork failed in subtree because this pids_cgroup limit was hit. */ + PIDCG_MAX, + /* Fork failed in this pids_cgroup because ancestor limit was hit. */ + PIDCG_FORKFAIL, + NR_PIDCG_EVENTS, +}; + struct pids_cgroup { struct cgroup_subsys_state css; @@ -49,11 +57,12 @@ struct pids_cgroup { atomic64_t limit; int64_t watermark; - /* Handle for "pids.events" */ + /* Handles for pids.events[.local] */ struct cgroup_file events_file; + struct cgroup_file events_local_file; - /* Number of times fork failed because limit was hit. */ - atomic64_t events_limit; + atomic64_t events[NR_PIDCG_EVENTS]; + atomic64_t events_local[NR_PIDCG_EVENTS]; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -75,9 +84,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - atomic64_set(&pids->counter, 0); atomic64_set(&pids->limit, PIDS_MAX); - atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -150,12 +157,13 @@ static void pids_charge(struct pids_cgroup *pids, int num) * pids_try_charge - hierarchically try to charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge + * @fail: storage of pid cgroup causing the fail * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge * succeeded, otherwise -EAGAIN. */ -static int pids_try_charge(struct pids_cgroup *pids, int num) +static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail) { struct pids_cgroup *p, *q; @@ -168,9 +176,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > limit) + if (new > limit) { + *fail = p; goto revert; - + } /* * Not technically accurate if we go over limit somewhere up * the hierarchy, but that's tolerable for the watermark. @@ -231,44 +240,54 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) } } +static void pids_event(struct pids_cgroup *pids_forking, + struct pids_cgroup *pids_over_limit) +{ + struct pids_cgroup *p = pids_forking; + + /* Only log the first time limit is hit. */ + if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) { + pr_info("cgroup: fork rejected by pids controller in "); + pr_cont_cgroup_path(p->css.cgroup); + pr_cont("\n"); + } + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { + cgroup_file_notify(&p->events_local_file); + return; + } + + atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]); + cgroup_file_notify(&pids_over_limit->events_local_file); + + for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) { + atomic64_inc(&p->events[PIDCG_MAX]); + cgroup_file_notify(&p->events_file); + } +} + /* * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). */ static int pids_can_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; - struct pids_cgroup *pids; + struct pids_cgroup *pids, *pids_over_limit; int err; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); - err = pids_try_charge(pids, 1); - if (err) { - /* Only log the first time events_limit is incremented. */ - if (atomic64_inc_return(&pids->events_limit) == 1) { - pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(css->cgroup); - pr_cont("\n"); - } - cgroup_file_notify(&pids->events_file); - } + pids = css_pids(cset->subsys[pids_cgrp_id]); + err = pids_try_charge(pids, 1, &pids_over_limit); + if (err) + pids_event(pids, pids_over_limit); + return err; } static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; struct pids_cgroup *pids; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); + pids = css_pids(cset->subsys[pids_cgrp_id]); pids_uncharge(pids, 1); } @@ -339,11 +358,32 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css, return READ_ONCE(pids->watermark); } -static int pids_events_show(struct seq_file *sf, void *v) +static int __pids_events_show(struct seq_file *sf, bool local) { struct pids_cgroup *pids = css_pids(seq_css(sf)); + enum pidcg_event pe = PIDCG_MAX; + atomic64_t *events; + + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { + pe = PIDCG_FORKFAIL; + local = true; + } + events = local ? pids->events_local : pids->events; - seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe])); + return 0; +} + +static int pids_events_show(struct seq_file *sf, void *v) +{ + __pids_events_show(sf, false); + return 0; +} + +static int pids_events_local_show(struct seq_file *sf, void *v) +{ + __pids_events_show(sf, true); return 0; } @@ -370,9 +410,42 @@ static struct cftype pids_files[] = { .file_offset = offsetof(struct pids_cgroup, events_file), .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "events.local", + .seq_show = pids_events_local_show, + .file_offset = offsetof(struct pids_cgroup, events_local_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +static struct cftype pids_files_legacy[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = pids_peak_read, + }, + { + .name = "events", + .seq_show = pids_events_show, + .file_offset = offsetof(struct pids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; + struct cgroup_subsys pids_cgrp_subsys = { .css_alloc = pids_css_alloc, .css_free = pids_css_free, @@ -381,7 +454,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, .release = pids_release, - .legacy_cftypes = pids_files, + .legacy_cftypes = pids_files_legacy, .dfl_cftypes = pids_files, .threaded = true, }; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a8350d2d63e6..aac91466279f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -7,6 +7,8 @@ #include <linux/btf.h> #include <linux/btf_ids.h> +#include <trace/events/cgroup.h> + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) return per_cpu_ptr(cgrp->rstat_cpu, cpu); } +/* + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @fast_path determine the + * tracepoints being added, allowing us to diagnose "flush" related + * operations without handling high-frequency fast-path "update" events. + */ +static __always_inline +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, const bool fast_path) +{ + unsigned long flags; + bool contended; + + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + contended = !raw_spin_trylock_irqsave(cpu_lock, flags); + if (contended) { + if (fast_path) + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); + + raw_spin_lock_irqsave(cpu_lock, flags); + } + + if (fast_path) + trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); + + return flags; +} + +static __always_inline +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, unsigned long flags, + const bool fast_path) +{ + if (fast_path) + trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); + else + trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + /** * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup @@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { @@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) cgrp = parent; } - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); } /** @@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) struct cgroup *head = NULL, *parent, *child; unsigned long flags; - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) if (child != root) head = cgroup_rstat_push_children(head, child, cpu); unlock_ret: - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); return head; } @@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); +/* + * Helper functions for locking cgroup_rstat_lock. + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @cpu_in_loop indicate lock + * was released and re-taken when collection data from the CPUs. The + * value -1 is used when obtaining the main lock else this is the CPU + * number processed last. + */ +static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) + __acquires(&cgroup_rstat_lock) +{ + bool contended; + + contended = !spin_trylock_irq(&cgroup_rstat_lock); + if (contended) { + trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); + spin_lock_irq(&cgroup_rstat_lock); + } + trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); +} + +static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) + __releases(&cgroup_rstat_lock) +{ + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); + spin_unlock_irq(&cgroup_rstat_lock); +} + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, cpu); if (!cond_resched()) cpu_relax(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, cpu); } } } @@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } /** @@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) __acquires(&cgroup_rstat_lock) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + * @cgrp: cgroup used by tracepoint */ -void cgroup_rstat_flush_release(void) +void cgroup_rstat_flush_release(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } int cgroup_rstat_init(struct cgroup *cgrp) @@ -366,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif + dst_bstat->ntime += src_bstat->ntime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, @@ -377,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif + dst_bstat->ntime -= src_bstat->ntime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) @@ -456,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { - case CPUTIME_USER: case CPUTIME_NICE: + rstatc->bstat.ntime += delta_exec; + fallthrough; + case CPUTIME_USER: rstatc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: @@ -508,64 +590,65 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; #endif + bstat->ntime += cpustat[CPUTIME_NICE]; } } -void cgroup_base_stat_cputime_show(struct seq_file *seq) + +static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime; - struct cgroup_base_stat bstat; #ifdef CONFIG_SCHED_CORE - u64 forceidle_time; + u64 forceidle_time = bstat->forceidle_sum; + + do_div(forceidle_time, NSEC_PER_USEC); + seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif +} + +void cgroup_base_stat_cputime_show(struct seq_file *seq) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime, ntime; if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); -#ifdef CONFIG_SCHED_CORE - forceidle_time = cgrp->bstat.forceidle_sum; -#endif - cgroup_rstat_flush_release(); + ntime = cgrp->bstat.ntime; + cgroup_rstat_flush_release(cgrp); } else { - root_cgroup_cputime(&bstat); - usage = bstat.cputime.sum_exec_runtime; - utime = bstat.cputime.utime; - stime = bstat.cputime.stime; -#ifdef CONFIG_SCHED_CORE - forceidle_time = bstat.forceidle_sum; -#endif + /* cgrp->bstat of root is not actually used, reuse it */ + root_cgroup_cputime(&cgrp->bstat); + usage = cgrp->bstat.cputime.sum_exec_runtime; + utime = cgrp->bstat.cputime.utime; + stime = cgrp->bstat.cputime.stime; + ntime = cgrp->bstat.ntime; } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); -#ifdef CONFIG_SCHED_CORE - do_div(forceidle_time, NSEC_PER_USEC); -#endif + do_div(ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" - "user_usec %llu\n" - "system_usec %llu\n", - usage, utime, stime); + "user_usec %llu\n" + "system_usec %llu\n" + "nice_usec %llu\n", + usage, utime, stime, ntime); -#ifdef CONFIG_SCHED_CORE - seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); -#endif + cgroup_force_idle_show(seq, &cgrp->bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ -BTF_SET8_START(bpf_rstat_kfunc_ids) +BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, cgroup_rstat_updated) BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) -BTF_SET8_END(bpf_rstat_kfunc_ids) +BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, |