Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r-- | kernel/cgroup/cpuset.c | 2161
1 file changed, 723 insertions, 1438 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4237c8748715..3bc4301466f3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,11 +21,8 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ +#include "cpuset-internal.h" -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/cpuset.h> -#include <linux/delay.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> @@ -39,10 +36,8 @@ #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/security.h> -#include <linux/spinlock.h> #include <linux/oom.h> #include <linux/sched/isolation.h> -#include <linux/cgroup.h> #include <linux/wait.h> #include <linux/workqueue.h> @@ -56,30 +51,6 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -/* - * Invalid partition error code - */ -enum prs_errcode { - PERR_NONE = 0, - PERR_INVCPUS, - PERR_INVPARENT, - PERR_NOTPART, - PERR_NOTEXCL, - PERR_NOCPUS, - PERR_HOTPLUG, - PERR_CPUSEMPTY, - PERR_HKEEPING, -}; - static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", @@ -87,122 +58,20 @@ static const char * const perr_strings[] = { [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", - [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", + [PERR_ACCESS] = "Enable partition not permitted", + [PERR_REMOTE] = "Have remote partition underneath", }; -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierarchy: - * - * The user-configured masks are always the same with effective masks. - */ - - /* user-configured CPUs and Memory Nodes allow to tasks */ - cpumask_var_t cpus_allowed; - nodemask_t mems_allowed; - - /* effective CPUs and Memory Nodes allow to tasks */ - cpumask_var_t effective_cpus; - nodemask_t effective_mems; - - /* - * Exclusive CPUs dedicated to current cgroup (default hierarchy only) - * - * This exclusive CPUs must be a subset of cpus_allowed. A parent - * cgroup can only grant exclusive CPUs to one of its children. - * - * When the cgroup becomes a valid partition root, effective_xcpus - * defaults to cpus_allowed if not set. 
The effective_cpus of a valid - * partition root comes solely from its effective_xcpus and some of the - * effective_xcpus may be distributed to sub-partitions below & hence - * excluded from its effective_cpus. - */ - cpumask_var_t effective_xcpus; - - /* - * Exclusive CPUs as requested by the user (default hierarchy only) - */ - cpumask_var_t exclusive_cpus; - - /* - * This is old Memory Nodes tasks took on. - * - * - top_cpuset.old_mems_allowed is initialized to mems_allowed. - * - A new cpuset's old_mems_allowed is initialized when some - * task is moved into it. - * - old_mems_allowed is used in cpuset_migrate_mm() when we change - * cpuset.mems_allowed and have tasks' nodemask updated, and - * then old_mems_allowed is updated to mems_allowed. - */ - nodemask_t old_mems_allowed; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* - * Tasks are being attached to this cpuset. Used to prevent - * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). - */ - int attach_in_progress; - - /* partition number for rebuild_sched_domains() */ - int pn; - - /* for custom sched domain */ - int relax_domain_level; - - /* number of valid sub-partitions */ - int nr_subparts; - - /* partition root state */ - int partition_root_state; - - /* - * Default hierarchy only: - * use_parent_ecpus - set if using parent's effective_cpus - * child_ecpus_count - # of children with use_parent_ecpus set - */ - int use_parent_ecpus; - int child_ecpus_count; - - /* - * number of SCHED_DEADLINE tasks attached to this cpuset, so that we - * know when to rebuild associated root domain bandwidth information. - */ - int nr_deadline_tasks; - int nr_migrate_dl_tasks; - u64 sum_migrate_dl_bw; - - /* Invalid partition error code, not lock protected */ - enum prs_errcode prs_err; - - /* Handle for cpuset.cpus.partition */ - struct cgroup_file partition_file; - - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; -}; - /* - * Exclusive CPUs distributed out to sub-partitions of top_cpuset + * For local partitions, update to subpartitions_cpus & isolated_cpus is done + * in update_parent_effective_cpumask(). For remote partitions, it is done in + * the remote_partition_*() and remote_cpus_update() helpers. + */ +/* + * Exclusive CPUs distributed out to local or remote sub-partitions of + * top_cpuset */ static cpumask_var_t subpartitions_cpus; @@ -211,10 +80,32 @@ static cpumask_var_t subpartitions_cpus; */ static cpumask_var_t isolated_cpus; +/* + * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot + */ +static cpumask_var_t boot_hk_cpus; +static bool have_boot_isolcpus; + /* List of remote partition root children */ static struct list_head remote_children; /* + * A flag to force sched domain rebuild at the end of an operation. + * It can be set in + * - update_partition_sd_lb() + * - update_cpumasks_hier() + * - cpuset_update_flag() + * - cpuset_hotplug_update_tasks() + * - cpuset_handle_hotplug() + * + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * + * Note that update_relax_domain_level() in cpuset-v1.c can still call + * rebuild_sched_domains_locked() directly without using this flag. + */ +static bool force_sd_rebuild; + +/* * Partition root states: * * 0 - member (not a partition root) @@ -222,6 +113,17 @@ static struct list_head remote_children; * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root + * + * There are 2 types of partitions - local or remote. 
Local partitions are + * those whose parents are partition root themselves. Setting of + * cpuset.cpus.exclusive are optional in setting up local partitions. + * Remote partitions are those whose parents are not partition roots. Passing + * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor + * nodes are mandatory in creating a remote partition. + * + * For simplicity, a local partition can be created under a local or remote + * partition but a remote partition cannot have any partition root in its + * ancestor chain except the cgroup root. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -243,22 +145,6 @@ struct tmpmasks { cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); @@ -273,59 +159,6 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline bool is_cpuset_online(struct cpuset *cs) -{ - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); -} - -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} - -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; @@ -359,47 +192,32 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) WRITE_ONCE(cs->prs_err, PERR_NONE); } +/* + * The top_cpuset is always synchronized to cpu_active_mask and we should avoid + * using cpu_online_mask as much as possible. An active CPU is always an online + * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ + * during hotplug operations. A CPU is marked active at the last stage of CPU + * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code + * will be called to update the sched domains so that the scheduler can move + * a normal task to a newly active CPU or remove tasks away from a newly + * inactivated CPU. The online bit is set much earlier in the CPU bringup + * process and cleared much later in CPU teardown. 
+ * + * If cpu_online_mask is used while a hotunplug operation is happening in + * parallel, we may leave an offline CPU in cpu_allowed or some other masks. + */ static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), + .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through the online children of @parent_cs. Must be used - * with RCU read locked. - */ -#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ - css_for_each_child((pos_css), &(parent_cs)->css) \ - if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) - -/** - * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants - * @des_cs: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @root_cs: target cpuset to walk ancestor of - * - * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. @root_cs is included in the - * iteration and the first node to be visited. - */ -#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ - css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ - if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) - /* * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. We also require taking task_lock() when dereferencing a - * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems - * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset + * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel + * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset * structures. Note that cpuset_mutex needs to be a mutex as it is used in * paths that rely on priority inheritance (e.g. scheduler - on RT) for * correctness. @@ -425,12 +243,9 @@ static struct cpuset top_cpuset = { * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_lock across + * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. - * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c */ static DEFINE_MUTEX(cpuset_mutex); @@ -447,13 +262,17 @@ void cpuset_unlock(void) static DEFINE_SPINLOCK(callback_lock); -static struct workqueue_struct *cpuset_migrate_mm_wq; +void cpuset_callback_lock_irq(void) +{ + spin_lock_irq(&callback_lock); +} -/* - * CPU / memory hotplug is handled asynchronously. 
- */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +void cpuset_callback_unlock_irq(void) +{ + spin_unlock_irq(&callback_lock); +} + +static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -469,6 +288,32 @@ static inline void check_insane_mems_config(nodemask_t *nodes) } /* + * decrease cs->attach_in_progress. + * wake_up cpuset_attach_wq if cs->attach_in_progress==0. + */ +static inline void dec_attach_in_progress_locked(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +} + +static inline void dec_attach_in_progress(struct cpuset *cs) +{ + mutex_lock(&cpuset_mutex); + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); +} + +static inline bool cpuset_v2(void) +{ + return !IS_ENABLED(CONFIG_CPUSETS_V1) || + cgroup_subsys_on_dfl(cpuset_cgrp_subsys); +} + +/* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. @@ -478,7 +323,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline bool is_in_v2_mode(void) { - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } @@ -524,38 +369,26 @@ static inline bool partition_is_populated(struct cpuset *cs, * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct task_struct *tsk, +static void guarantee_active_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; - if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) - cpumask_copy(pmask, cpu_online_mask); + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); rcu_read_lock(); cs = task_cs(tsk); - while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus); -out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } @@ -577,45 +410,6 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } -/* - * update task's spread flag if cpuset's page/slab spread flag is set - * - * Call with callback_lock or cpuset_mutex held. The check can be skipped - * if on default hierarchy. 
- */ -static void cpuset_update_task_spread_flags(struct cpuset *cs, - struct task_struct *tsk) -{ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) - return; - - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} - -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. - */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); -} - /** * alloc_cpumasks - allocate three cpumasks for cpuset * @cs: the cpuset that have cpumasks to be allocated. @@ -718,11 +512,17 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } -static inline struct cpumask *fetch_xcpus(struct cpuset *cs) +/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) { - return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : - cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed - : cs->effective_xcpus; + return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed + : cs->exclusive_cpus; +} + +static inline bool xcpus_empty(struct cpuset *cs) +{ + return cpumask_empty(cs->cpus_allowed) && + cpumask_empty(cs->exclusive_cpus); } /* @@ -732,8 +532,8 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs) */ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { - struct cpumask *xcpus1 = fetch_xcpus(cs1); - struct cpumask *xcpus2 = fetch_xcpus(cs2); + struct cpumask *xcpus1 = user_xcpus(cs1); + struct cpumask *xcpus2 = user_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) return false; @@ -741,35 +541,6 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) } /* - * validate_change_legacy() - Validate conditions specific to legacy (v1) - * behavior. - */ -static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) -{ - struct cgroup_subsys_state *css; - struct cpuset *c, *par; - int ret; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - par = parent_cs(cur); - if (par && !is_cpuset_subset(trial, par)) - goto out; - - ret = 0; -out: - return ret; -} - -/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -798,7 +569,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) rcu_read_lock(); if (!is_in_v2_mode()) - ret = validate_change_legacy(cur, trial); + ret = cpuset1_validate_change(cur, trial); if (ret) goto out; @@ -824,27 +595,63 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* * We can't shrink if we won't have enough room for SCHED_DEADLINE - * tasks. + * tasks. This check is not done when scheduling is disabled as the + * users should know what they are doing. 
+ * + * For v1, effective_cpus == cpus_allowed & user_xcpus() returns + * cpus_allowed. + * + * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only + * for non-isolated partition root. At this point, the target + * effective_cpus isn't computed yet. user_xcpus() is the best + * approximation. + * + * TBD: May need to precompute the real effective_cpus here in case + * incorrect scheduling of SCHED_DEADLINE tasks in a partition + * becomes an issue. */ ret = -EBUSY; - if (is_cpu_exclusive(cur) && - !cpuset_cpumask_can_shrink(cur->cpus_allowed, - trial->cpus_allowed)) + if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && + !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) goto out; /* * If either I or some sibling (!= me) is exclusive, we can't - * overlap + * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur) { + bool txset, cxset; /* Are exclusive_cpus set? */ + + if (c == cur) + continue; + + txset = !cpumask_empty(trial->exclusive_cpus); + cxset = !cpumask_empty(c->exclusive_cpus); + if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || + (txset && cxset)) { if (!cpusets_are_exclusive(trial, c)) goto out; + } else if (txset || cxset) { + struct cpumask *xcpus, *acpus; + + /* + * When just one of the exclusive_cpus's is set, + * cpus_allowed of the other cpuset, if set, cannot be + * a subset of it or none of those CPUs will be + * available if these exclusive CPUs are activated. + */ + if (txset) { + xcpus = trial->exclusive_cpus; + acpus = c->cpus_allowed; + } else { + xcpus = c->exclusive_cpus; + acpus = trial->cpus_allowed; + } + if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) + goto out; } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) goto out; } @@ -940,18 +747,15 @@ static inline int nr_cpusets(void) * were changed (added or removed.) * * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). 
*/ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) @@ -959,20 +763,23 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ + int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); + bool cgrpv2 = cpuset_v2(); + int nslot_update; doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts) { + if (root_load_balance && cpumask_empty(subpartitions_cpus)) { +single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -1000,16 +807,18 @@ static int generate_sched_domains(cpumask_var_t **domains, cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; + + if (cgrpv2) + goto v2; + /* + * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. - * - * If root is load-balancing, we can skip @cp if it - * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && @@ -1017,47 +826,62 @@ static int generate_sched_domains(cpumask_var_t **domains, housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; - if (root_load_balance && - cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) - continue; - if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; - /* skip @cp's subtree if not a partition root */ - if (!is_partition_valid(cp)) + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + +v2: + /* + * Only valid partition roots that are not isolated and with + * non-empty effective_cpus will be saved into csn[]. + */ + if ((cp->partition_root_state == PRS_ROOT) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* + * Skip @cp's subtree if not a partition root and has no + * exclusive CPUs to be granted to child cpusets. + */ + if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); + /* + * If there are only isolated partitions underneath the cgroup root, + * we can optimize out unneeded sched domains scanning. 
+ */ + if (root_load_balance && (csn == 1)) + goto single_root_domain; + for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; + uf_node_init(&csa[i]->node); -restart: - /* Find the best partition (set of sched domains) */ + /* Merge overlapping cpusets */ for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) { + /* + * Cgroup v2 shouldn't pass down overlapping + * partition root cpusets. + */ + WARN_ON_ONCE(cgrpv2); + uf_union(&csa[i]->node, &csa[j]->node); } } } + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. @@ -1073,45 +897,48 @@ restart: dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); - for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + /* + * Cgroup v2 doesn't support domain attributes, just set all of them + * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a + * subset of HK_TYPE_DOMAIN housekeeping CPUs. + */ + if (cgrpv2) { + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (csa[i] == &top_cpuset) + cpumask_and(doms[i], csa[i]->effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } + goto done; + } - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; + update_domain_attr_tree(dattr + nslot, csa[j]); } } - nslot++; + if (nslot_update) + nslot++; } BUG_ON(nslot != ndoms); @@ -1146,10 +973,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void dl_rebuild_rd_accounting(void) +void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; + int cpu; + u64 cookie = ++dl_cookie; lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); @@ -1157,11 +986,12 @@ static void dl_rebuild_rd_accounting(void) rcu_read_lock(); - /* - * Clear default root domain DL accounting, it will be computed again - * if a task belongs to it. 
- */ - dl_clear_root_domain(&def_root_domain); + for_each_possible_cpu(cpu) { + if (dl_bw_visited(cpu, cookie)) + continue; + + dl_clear_root_domain_cpu(cpu); + } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { @@ -1182,16 +1012,6 @@ static void dl_rebuild_rd_accounting(void) rcu_read_unlock(); } -static void -partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - mutex_lock(&sched_domains_mutex); - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - dl_rebuild_rd_accounting(); - mutex_unlock(&sched_domains_mutex); -} - /* * Rebuild scheduler domains. * @@ -1203,7 +1023,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; @@ -1213,11 +1033,12 @@ static void rebuild_sched_domains_locked(void) lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); + force_sd_rebuild = false; /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset @@ -1232,7 +1053,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts) { + if (!cpumask_empty(subpartitions_cpus)) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -1252,35 +1073,52 @@ static void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); + partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ -void rebuild_sched_domains(void) +static void rebuild_sched_domains_cpuslocked(void) { - cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); +} + +void rebuild_sched_domains(void) +{ + cpus_read_lock(); + rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } +void cpuset_reset_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + partition_sched_domains(1, NULL, NULL); + mutex_unlock(&cpuset_mutex); +} + /** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, - * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() - * is used instead of effective_cpus to make sure all offline CPUs are also - * included as hotplug code won't update cpumasks for tasks in top_cpuset. + * cpuset membership stays stable. 
+ * + * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus + * to make sure all offline CPUs are also included as hotplug code won't + * update cpumasks for tasks in top_cpuset. + * + * As task_cpu_possible_mask() can be task dependent in arm64, we have to + * do cpu masking per task instead of doing it once for all. */ -static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1292,9 +1130,11 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* - * Percpu kthreads in top_cpuset are ignored + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). */ - if (kthread_is_per_cpu(task)) + if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { @@ -1330,8 +1170,6 @@ enum partition_cmd { partcmd_invalidate, /* Make partition invalid */ }; -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on); static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); @@ -1340,16 +1178,16 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * * Return: 0 if successful, an error code otherwise */ -static int update_partition_exclusive(struct cpuset *cs, int new_prs) +static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) { - bool exclusive = (new_prs > 0); + bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { - if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } @@ -1385,7 +1223,7 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) } if (rebuild_domains) - rebuild_sched_domains_locked(); + cpuset_force_rebuild(); } /* @@ -1407,7 +1245,7 @@ static void reset_partition_data(struct cpuset *cs) { struct cpuset *parent = parent_cs(cs); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (!cpuset_v2()) return; lockdep_assert_held(&callback_lock); @@ -1418,21 +1256,17 @@ static void reset_partition_data(struct cpuset *cs) if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } - if (!cpumask_and(cs->effective_cpus, - parent->effective_cpus, cs->cpus_allowed)) { - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; + if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) cpumask_copy(cs->effective_cpus, parent->effective_cpus); - } } /* - * partition_xcpus_newstate - Exclusive CPUs state change + * isolated_cpus_update - Update the isolated_cpus mask * @old_prs: old partition_root_state * @new_prs: new partition_root_state * @xcpus: exclusive CPUs with state change */ -static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) +static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); if (new_prs == PRS_ISOLATED) @@ -1466,8 +1300,8 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, isolcpus_updated = (new_prs != parent->partition_root_state); if (isolcpus_updated) - 
partition_xcpus_newstate(parent->partition_root_state, new_prs, - xcpus); + isolated_cpus_update(parent->partition_root_state, new_prs, + xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; @@ -1497,8 +1331,8 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, isolcpus_updated = (old_prs != parent->partition_root_state); if (isolcpus_updated) - partition_xcpus_newstate(old_prs, parent->partition_root_state, - xcpus); + isolated_cpus_update(old_prs, parent->partition_root_state, + xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); @@ -1533,25 +1367,55 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset * @xcpus: effective exclusive CPUs value to be set - * Return: true if xcpus is not empty, false otherwise. + * @real_cs: the real cpuset (can be NULL) + * Return: 0 if there is no sibling conflict, > 0 otherwise * - * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of cpus_allowed and parent's effective_xcpus. + * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to + * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus + * as well. The provision of real_cs means that a cpumask is being changed and + * the given cs is a trial one. */ -static bool compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus) +static int compute_effective_exclusive_cpumask(struct cpuset *cs, + struct cpumask *xcpus, + struct cpuset *real_cs) { + struct cgroup_subsys_state *css; struct cpuset *parent = parent_cs(cs); + struct cpuset *sibling; + int retval = 0; if (!xcpus) xcpus = cs->effective_xcpus; - if (!cpumask_empty(cs->exclusive_cpus)) - cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); - else - cpumask_copy(xcpus, cs->cpus_allowed); + cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); + + if (!real_cs) { + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + } else { + cs = real_cs; + } + + /* + * Exclude exclusive CPUs from siblings + */ + rcu_read_lock(); + cpuset_for_each_child(sibling, css, parent) { + if (sibling == cs) + continue; - return cpumask_and(xcpus, xcpus, parent->effective_xcpus); + if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { + cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + retval++; + continue; + } + if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { + cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + retval++; + } + } + rcu_read_unlock(); + return retval; } static inline bool is_remote_partition(struct cpuset *cs) @@ -1568,8 +1432,8 @@ static inline bool is_local_partition(struct cpuset *cs) * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update * @new_prs: new partition_root_state - * @tmp: temparary masks - * Return: 1 if successful, 0 if error + * @tmp: temporary masks + * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. @@ -1583,46 +1447,45 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The user must have sysadmin privilege. 
*/ if (!capable(CAP_SYS_ADMIN)) - return 0; + return PERR_ACCESS; /* * The requested exclusive_cpus must not be allocated to other * partitions and it can't use up all the root's effective_cpus. * - * Note that if there is any local partition root above it or - * remote partition root underneath it, its exclusive_cpus must - * have overlapped with subpartitions_cpus. + * The effective_xcpus mask can contain offline CPUs, but there must + * be at least one or more online CPUs present before it can be enabled. + * + * Note that creating a remote partition with any local partition root + * above it or remote partition root underneath it is not allowed. */ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); - if (cpumask_empty(tmp->new_cpus) || - cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); + if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) - return 0; + return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } + cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); + cs->prs_err = 0; /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; + return 0; } /* * remote_partition_disable - Remove current cpuset from remote partition list * @cs: the cpuset to update - * @tmp: temparary masks + * @tmp: temporary masks * * The effective_cpus is also updated. * @@ -1632,39 +1495,44 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { bool isolcpus_updated; - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); - WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, tmp->new_cpus); - cs->partition_root_state = -cs->partition_root_state; - if (!cs->prs_err) - cs->prs_err = PERR_INVCPUS; + NULL, cs->effective_xcpus); + if (cs->prs_err) + cs->partition_root_state = -cs->partition_root_state; + else + cs->partition_root_state = PRS_MEMBER; + + /* effective_xcpus may need to be changed */ + compute_effective_exclusive_cpumask(cs, NULL, NULL); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. 
*/ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } /* * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask - * @tmp: temparary masks + * @xcpus: the new exclusive_cpus mask, if non-NULL + * @excpus: the new effective_xcpus mask + * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. */ -static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, - struct tmpmasks *tmp) +static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, + struct cpumask *excpus, struct tmpmasks *tmp) { bool adding, deleting; int prs = cs->partition_root_state; @@ -1675,34 +1543,51 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); - if (cpumask_empty(newmask)) + if (cpumask_empty(excpus)) { + cs->prs_err = PERR_CPUSEMPTY; goto invalidate; + } - adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); - deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); /* * Additions of remote CPUs is only allowed if those CPUs are * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ - if (adding && (!capable(CAP_SYS_ADMIN) || - cpumask_intersects(tmp->addmask, subpartitions_cpus) || - cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) - goto invalidate; + if (adding) { + WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); + if (!capable(CAP_SYS_ADMIN)) + cs->prs_err = PERR_ACCESS; + else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) + cs->prs_err = PERR_NOCPUS; + if (cs->prs_err) + goto invalidate; + } spin_lock_irq(&callback_lock); if (adding) isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + /* + * Need to update effective_xcpus and exclusive_cpus now as + * update_sibling_cpumasks() below may iterate back to the same cs. + */ + cpumask_copy(cs->effective_xcpus, excpus); + if (xcpus) + cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + if (adding || deleting) + cpuset_force_rebuild(); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; @@ -1711,61 +1596,20 @@ invalidate: } /* - * remote_partition_check - check if a child remote partition needs update - * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask - * @delmask: temporary mask for deletion (not in tmp) - * @tmp: temparary masks - * - * This should be called before the given cs has updated its cpus_allowed - * and/or effective_xcpus. 
- */ -static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, - struct cpumask *delmask, struct tmpmasks *tmp) -{ - struct cpuset *child, *next; - int disable_cnt = 0; - - /* - * Compute the effective exclusive CPUs that will be deleted. - */ - if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || - !cpumask_intersects(delmask, subpartitions_cpus)) - return; /* No deletion of exclusive CPUs in partitions */ - - /* - * Searching the remote children list to look for those that will - * be impacted by the deletion of exclusive CPUs. - * - * Since a cpuset must be removed from the remote children list - * before it can go offline and holding cpuset_mutex will prevent - * any change in cpuset status. RCU read lock isn't needed. - */ - lockdep_assert_held(&cpuset_mutex); - list_for_each_entry_safe(child, next, &remote_children, remote_sibling) - if (cpumask_intersects(child->effective_cpus, delmask)) { - remote_partition_disable(child, tmp); - disable_cnt++; - } - if (disable_cnt) - rebuild_sched_domains_locked(); -} - -/* * prstate_housekeeping_conflict - check for partition & housekeeping conflicts * @prstate: partition root state to be checked * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * - * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in - * an isolated partition. + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { - const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); - bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + if (!have_boot_isolcpus) + return false; - if (!all_in_hk && (prstate != PRS_ISOLATED)) + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) return true; return false; @@ -1805,7 +1649,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't - * check for error and so partition_root_state and prs_error will be updated + * check for error and so partition_root_state and prs_err will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, @@ -1818,11 +1662,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ int subparts_delta = 0; - struct cpumask *xcpus; /* cs effective_xcpus */ int isolcpus_updated = 0; + struct cpumask *xcpus = user_xcpus(cs); bool nocpu; lockdep_assert_held(&cpuset_mutex); + WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* * new_prs will only be changed for the partcmd_update and @@ -1830,8 +1675,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = !cpumask_empty(cs->exclusive_cpus) - ? cs->effective_xcpus : cs->cpus_allowed; if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1859,19 +1702,26 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? 
PERR_INVPARENT : PERR_NOTPART; } - if (!newmask && cpumask_empty(cs->cpus_allowed)) + if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY; nocpu = tasks_nocpu_error(parent, cs, xcpus); if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* + * Need to call compute_effective_exclusive_cpumask() in case + * exclusive_cpus not set. Sibling conflict should only happen + * if exclusive_cpus isn't set. + */ + xcpus = tmp->delmask; + if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + + /* * Enabling partition root is not allowed if its - * effective_xcpus is empty or doesn't overlap with - * parent's effective_xcpus. + * effective_xcpus is empty. */ - if (cpumask_empty(xcpus) || - !cpumask_intersects(xcpus, parent->effective_xcpus)) + if (cpumask_empty(xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(new_prs, xcpus)) @@ -1884,19 +1734,33 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->delmask, xcpus); + /* + * This function will only be called when all the preliminary + * checks have passed. At this point, the following condition + * should hold. + * + * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus + * + * Warn if it is not the case. + */ + cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + deleting = true; subparts_delta++; new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* - * May need to add cpus to parent's effective_cpus for - * valid partition root. + * May need to add cpus back to parent's effective_cpus + * (and maybe removed from subpartitions_cpus/isolated_cpus) + * for valid partition root. xcpus may contain CPUs that + * shouldn't be removed from the two global cpumasks. */ - adding = !is_prs_invalid(old_prs) && - cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (adding) + if (is_partition_valid(cs)) { + cpumask_copy(tmp->addmask, cs->effective_xcpus); + adding = true; subparts_delta--; + } new_prs = PRS_MEMBER; } else if (newmask) { /* @@ -1907,6 +1771,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, goto write_error; } + /* Check newmask again, whether cpus are available for parent/cs */ + nocpu |= tasks_nocpu_error(parent, cs, newmask); + /* * partcmd_update with newmask: * @@ -1935,6 +1802,15 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, parent->effective_xcpus); } /* + * The new CPUs to be removed from parent's effective CPUs + * must be present. + */ + if (deleting) { + cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + } + + /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ @@ -2032,7 +1908,7 @@ write_error: * CPU lists in cs haven't been updated yet. So defer it to later. 
*/ if ((old_prs != new_prs) && (cmd != partcmd_update)) { - int err = update_partition_exclusive(cs, new_prs); + int err = update_partition_exclusive_flag(cs, new_prs); if (err) return err; @@ -2070,23 +1946,20 @@ write_error: update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); if (adding || deleting) { - update_tasks_cpumask(parent, tmp->addmask); + cpuset_update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } /* * For partcmd_update without newmask, it is being called from - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. - * Update the load balance flag and scheduling domain if - * cpus_read_trylock() is successful. + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. */ - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); - cpus_read_unlock(); - } notify_partition_change(cs, old_prs); return 0; @@ -2123,7 +1996,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus); + compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -2131,6 +2004,11 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, if (!is_partition_valid(child)) continue; + /* + * There shouldn't be a remote partition underneath another + * partition root. + */ + WARN_ON_ONCE(is_remote_partition(child)); child->prs_err = 0; if (!cpumask_subset(child->effective_xcpus, cs->effective_xcpus)) @@ -2160,12 +2038,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, } /* - * update_cpumasks_hier() flags - */ -#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ -#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ - -/* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup @@ -2179,7 +2051,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, - int flags) + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -2192,32 +2064,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool remote = is_remote_partition(cp); bool update_parent = false; + old_prs = new_prs = cp->partition_root_state; + /* - * Skip descendent remote partition that acquires CPUs - * directly from top cpuset unless it is cs. + * For child remote partition root (!= cs), we need to call + * remote_cpus_update() if effective_xcpus will be changed. + * Otherwise, we can skip the whole subtree. + * + * remote_cpus_update() will reuse tmp->new_cpus only after + * its value is being processed. 
*/ if (remote && (cp != cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); + rcu_read_lock(); - /* - * Update effective_xcpus if exclusive_cpus set. - * The case when exclusive_cpus isn't set is handled later. - */ - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { - spin_lock_irq(&callback_lock); - compute_effective_exclusive_cpumask(cp, NULL); - spin_unlock_irq(&callback_lock); + /* Remote partition may be invalidated */ + new_prs = cp->partition_root_state; + remote = (new_prs == old_prs); } - old_prs = new_prs = cp->partition_root_state; - if (remote || (is_partition_valid(parent) && - is_partition_valid(cp))) + if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); else compute_effective_cpumask(tmp->new_cpus, cp, parent); + if (remote) + goto get_css; /* Ready to update cpuset data */ + /* * A partition with no effective_cpus is allowed as long as * there is no task associated with it. Call @@ -2234,31 +2113,19 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (!cp->use_parent_ecpus) { - cp->use_parent_ecpus = true; - parent->child_ecpus_count++; - } - } else if (cp->use_parent_ecpus) { - cp->use_parent_ecpus = false; - WARN_ON_ONCE(!parent->child_ecpus_count); - parent->child_ecpus_count--; - } - - if (remote) - goto get_css; /* * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, - * 3) HIER_CHECKALL flag not set, and + * 3) force flag not set, and * 4) for v2 load balance state same as its parent. */ - if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && + if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (!cpuset_v2() || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2268,7 +2135,7 @@ update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call - * update_tasks_cpumask() again for tasks in the parent + * cpuset_update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { @@ -2309,6 +2176,9 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) + compute_effective_exclusive_cpumask(cp, NULL, NULL); + /* * Make sure effective_xcpus is properly set for a valid * partition root. 
@@ -2325,15 +2195,14 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE * from parent if current cpuset isn't a valid partition root * and their load balance states differ. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_partition_valid(cp) && + if (cpuset_v2() && !is_partition_valid(cp) && (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { if (is_sched_load_balance(parent)) set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); @@ -2349,8 +2218,7 @@ get_css: */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_valid(cp))) + (!cpuset_v2() || is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); @@ -2358,8 +2226,8 @@ get_css: } rcu_read_unlock(); - if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) - rebuild_sched_domains_locked(); + if (need_rebuild_sched_domains) + cpuset_force_rebuild(); } /** @@ -2380,33 +2248,36 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * - * With the addition of effective_xcpus which is a subset of - * cpus_allowed. It is possible a change in parent's effective_cpus + * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. * * The update_cpumasks_hier() function may sleep. So we have to - * release the RCU read lock before calling it. HIER_NO_SD_REBUILD - * flag is used to suppress rebuild of sched domains as the callers - * will take care of that. + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus && - !is_partition_valid(sibling)) { + if (!is_partition_valid(sibling)) { compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; + } else if (is_remote_partition(sibling)) { + /* + * Change in a sibling cpuset won't affect a remote + * partition root. 
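Note how update_cpumasks_hier() now ends with cpuset_force_rebuild() instead of calling rebuild_sched_domains_locked() itself: helpers only record that a rebuild is needed and the outermost caller performs it once. The sketch below is a userspace model of that coalescing pattern with made-up names; the kernel additionally relies on cpuset_mutex and cpus_read_lock for serialization.

/* cc -o coalesce coalesce.c && ./coalesce */
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins, not the kernel symbols. */
static bool need_rebuild;

static void request_rebuild(void)   { need_rebuild = true; }
static void expensive_rebuild(void) { puts("rebuilding sched domains once"); }

static void update_step(int step, bool changes_domains)
{
	printf("step %d applied\n", step);
	if (changes_domains)
		request_rebuild();	/* defer, do not rebuild here */
}

int main(void)
{
	/* Several sub-updates inside one logical operation. */
	update_step(1, true);
	update_step(2, false);
	update_step(3, true);

	/* One rebuild at the end, only if any step asked for it. */
	if (need_rebuild) {
		need_rebuild = false;
		expensive_rebuild();
	}
	return 0;
}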
+ */ + continue; } + if (!css_tryget_online(&sibling->css)) continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); + update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } @@ -2426,10 +2297,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ + /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; @@ -2441,7 +2312,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (!*buf) { cpumask_clear(trialcs->cpus_allowed); - cpumask_clear(trialcs->effective_xcpus); + if (cpumask_empty(trialcs->exclusive_cpus)) + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) @@ -2452,13 +2324,14 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EINVAL; /* - * When exclusive_cpus isn't explicitly set, it is constrainted + * When exclusive_cpus isn't explicitly set, it is constrained * by cpus_allowed and parent's effective_xcpus. Otherwise, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. */ + trialcs->partition_root_state = PRS_MEMBER; if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL); + compute_effective_exclusive_cpumask(trialcs, NULL, cs); } /* Nothing to do if the cpus didn't change */ @@ -2486,12 +2359,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. 
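update_cpumask() still accepts the familiar cpulist syntax through cpulist_parse() before any of the partition checks run. The following is a deliberately loose userspace approximation of that list format ("0-3,7" and the like) over a 64-bit mask, only to show what a write to cpuset.cpus means after parsing; it is not the kernel parser and skips most of its validation.

/* cc -o cpulist cpulist.c && ./cpulist */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_cpulist(const char *s, uint64_t *mask)
{
	*mask = 0;
	while (*s) {
		char *end;
		long a = strtol(s, &end, 10);
		long b = a;

		if (end == s || a < 0 || a > 63)
			return -1;
		if (*end == '-') {
			s = end + 1;
			b = strtol(s, &end, 10);
			if (end == s || b < a || b > 63)
				return -1;
		}
		for (long cpu = a; cpu <= b; cpu++)
			*mask |= 1ULL << cpu;
		if (*end == ',')
			end++;
		else if (*end != '\0')
			return -1;
		s = end;
	}
	return 0;
}

int main(void)
{
	uint64_t m;

	if (!parse_cpulist("0-3,7", &m))
		printf("mask = 0x%llx\n", (unsigned long long)m);	/* 0x8f */
	return 0;
}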
*/ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); - if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if ((retval == -EINVAL) && cpuset_v2()) { struct cgroup_subsys_state *css; struct cpuset *cp; @@ -2505,7 +2377,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, invalidate = true; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = fetch_xcpus(trialcs); + struct cpumask *xcpus = user_xcpus(trialcs); if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { @@ -2532,19 +2404,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Call remote_cpus_update() to handle valid remote partition */ if (is_remote_partition(cs)) - remote_cpus_update(cs, xcpus, &tmp); + remote_cpus_update(cs, NULL, xcpus, &tmp); else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, xcpus, &tmp); - } else if (!cpumask_empty(cs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); @@ -2555,7 +2421,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* effective_cpus/effective_xcpus will be updated here */ - update_cpumasks_hier(cs, &tmp, hier_flags); + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2580,7 +2446,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; if (!*buf) { @@ -2590,23 +2456,27 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; - if (!is_cpu_exclusive(cs)) - set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); } /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) - compute_effective_exclusive_cpumask(trialcs, NULL); + if (*buf) { + trialcs->partition_root_state = PRS_MEMBER; + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. + */ + if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) + return -EINVAL; + } /* * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. 
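Both update_cpumask() and update_exclusive_cpumask() now reject or invalidate configurations whose exclusive CPUs overlap a sibling partition's effective_xcpus. A toy model of that overlap test, with invented helper names and 64-bit masks standing in for cpumasks:

/* cc -o xcl xcl.c && ./xcl */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool conflicts_with_sibling(uint64_t requested,
				   const uint64_t *sibling_xcpus, int n)
{
	for (int i = 0; i < n; i++)
		if (requested & sibling_xcpus[i])
			return true;	/* overlap means not exclusive */
	return false;
}

int main(void)
{
	uint64_t siblings[] = { 0x03ULL /* CPUs 0-1 */, 0x0cULL /* CPUs 2-3 */ };

	printf("%d\n", conflicts_with_sibling(0x30ULL, siblings, 2));	/* 0: CPUs 4-5 are free   */
	printf("%d\n", conflicts_with_sibling(0x06ULL, siblings, 2));	/* 1: clashes with both   */
	return 0;
}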
*/ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); if (retval) @@ -2631,8 +2501,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (invalidate) remote_partition_disable(cs, &tmp); else - remote_cpus_update(cs, trialcs->effective_xcpus, - &tmp); + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, &tmp); } else if (invalidate) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); @@ -2640,12 +2510,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); } - } else if (!cpumask_empty(trialcs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); @@ -2659,8 +2523,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * of the subtree when it is a valid partition root or effective_xcpus * is updated. */ - if (is_partition_valid(cs) || hier_flags) - update_cpumasks_hier(cs, &tmp, hier_flags); + if (is_partition_valid(cs) || force) + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2754,14 +2618,14 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void *cpuset_being_rebound; /** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs) +void cpuset_update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; @@ -2859,7 +2723,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); - update_tasks_nodemask(cp); + cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -2945,44 +2809,8 @@ bool current_cpuset_is_being_rebound(void) return ret; } -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays - * stable. 
- */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flags(cs, task); - css_task_iter_end(&it); -} - /* - * update_flag - read a 0 or a 1 in a file and update associated flag + * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared @@ -2990,7 +2818,7 @@ static void update_tasks_flags(struct cpuset *cs) * Call with cpuset_mutex held. */ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; @@ -3021,11 +2849,15 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { + if (cpuset_v2()) + cpuset_force_rebuild(); + else + rebuild_sched_domains_locked(); + } if (spread_flag_changed) - update_tasks_flags(cs); + cpuset1_update_tasks_flags(cs); out: free_cpuset(trialcs); return err; @@ -3044,7 +2876,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; - bool new_xcpus_state = false; + bool isolcpus_updated = false; if (old_prs == new_prs) return 0; @@ -3058,45 +2890,50 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; - /* - * Setup effective_xcpus if not properly set yet, it will be cleared - * later if partition becomes invalid. - */ - if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { - spin_lock_irq(&callback_lock); - cpumask_and(cs->effective_xcpus, - cs->cpus_allowed, parent->effective_xcpus); - spin_unlock_irq(&callback_lock); - } - - err = update_partition_exclusive(cs, new_prs); + err = update_partition_exclusive_flag(cs, new_prs); if (err) goto out; if (!old_prs) { - enum partition_cmd cmd = (new_prs == PRS_ROOT) - ? partcmd_enable : partcmd_enablei; - /* - * cpus_allowed cannot be empty. + * cpus_allowed and exclusive_cpus cannot be both empty. */ - if (cpumask_empty(cs->cpus_allowed)) { + if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out; } - err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* - * If an attempt to become local partition root fails, - * try to become a remote partition root instead. + * We don't support the creation of a new local partition with + * a remote partition underneath it. This unsupported + * setting can happen only if parent is the top_cpuset because + * a remote partition cannot be created underneath an existing + * local or remote partition. */ - if (err && remote_partition_enable(cs, new_prs, &tmpmask)) - err = 0; + if ((parent == &top_cpuset) && + cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { + err = PERR_REMOTE; + goto out; + } + + /* + * If parent is valid partition, enable local partiion. + * Otherwise, enable a remote partition. + */ + if (is_partition_valid(parent)) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? 
partcmd_enable : partcmd_enablei; + + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); + } else { + err = remote_partition_enable(cs, new_prs, &tmpmask); + } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. + * Need to update isolated_cpus. */ - new_xcpus_state = true; + isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it @@ -3120,7 +2957,7 @@ out: */ if (err) { new_prs = -new_prs; - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); } spin_lock_irq(&callback_lock); @@ -3128,123 +2965,28 @@ out: WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); - else if (new_xcpus_state) - partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); + else if (isolcpus_updated) + isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(new_xcpus_state); + update_unbound_workqueue_cpumask(isolcpus_updated); - /* Force update if switching back to member */ - update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + /* Force update if switching back to member & update effective_xcpus */ + update_cpumasks_hier(cs, &tmpmask, !new_prs); + + /* A newly created partition must have effective_xcpus set */ + WARN_ON_ONCE(!old_prs && (new_prs > 0) + && cpumask_empty(cs->effective_xcpus)); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); free_cpumasks(NULL, &tmpmask); return 0; } -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. 
At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - static struct cpuset *cpuset_attach_old_cs; /* @@ -3301,8 +3043,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * migration permission derives from hierarchy ownership in * cgroup_procs_write_permission()). */ - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - (cpus_updated || mems_updated)) { + if (!cpuset_v2() || (cpus_updated || mems_updated)) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; @@ -3353,9 +3094,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cs = css_cs(css); mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); + dec_attach_in_progress_locked(cs); if (cs->nr_migrate_dl_tasks) { int cpu = cpumask_any(cs->effective_cpus); @@ -3380,7 +3119,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) - guarantee_online_cpus(task, cpus_attach); + guarantee_active_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), subpartitions_cpus); @@ -3391,7 +3130,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flags(cs, task); + cpuset1_update_task_spread_flags(cs, task); } static void cpuset_attach(struct cgroup_taskset *tset) @@ -3418,8 +3157,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * in effective cpus and mems. In that case, we can optimize out * by skipping the task iteration and update. 
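The frequency meter being moved out of this file is a small single-pole IIR filter and ports almost directly to userspace. The version below is a single-threaded transcription for experimentation only; the kernel variant shown above additionally guards the struct with a spinlock and reads time via ktime_get_seconds().

/* cc -o fmeter fmeter.c && ./fmeter */
#include <stdio.h>
#include <time.h>

#define FM_COEF     933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS 99
#define FM_MAXCNT   1000000
#define FM_SCALE    1000

struct fmeter {
	int cnt;	/* unprocessed events */
	int val;	/* filtered rate, scaled by FM_SCALE */
	time_t time;	/* seconds when val was last computed */
};

static void fmeter_update(struct fmeter *fmp)
{
	time_t now = time(NULL);
	long ticks = now - fmp->time;

	if (ticks == 0)
		return;
	if (ticks > FM_MAXTICKS)
		ticks = FM_MAXTICKS;
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;
	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

static void fmeter_markevent(struct fmeter *fmp)
{
	fmeter_update(fmp);
	fmp->cnt += FM_SCALE;
	if (fmp->cnt > FM_MAXCNT)
		fmp->cnt = FM_MAXCNT;
}

int main(void)
{
	struct fmeter fm = { .cnt = 0, .val = 0, .time = time(NULL) };

	for (int i = 0; i < 5; i++)
		fmeter_markevent(&fm);

	/* Pretend a second has passed so the filter folds in the events. */
	fm.time -= 1;
	fmeter_update(&fm);
	printf("filtered rate (x1000): %d\n", fm.val);
	return 0;
}

With FM_COEF at 933, the backdated update folds the five events marked within the same second into a scaled value of 335.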
*/ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !cpus_updated && !mems_updated) { + if (cpuset_v2() && !cpus_updated && !mems_updated) { cpuset_attach_nodemask_to = cs->effective_mems; goto out; } @@ -3470,116 +3208,15 @@ out: reset_migrate_dl_data(cs); } - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); + dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_SUBPARTS_CPULIST, - FILE_EXCLUSIVE_CPULIST, - FILE_EFFECTIVE_XCPULIST, - FILE_ISOLATED_CPULIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_PARTITION_ROOT, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; - } - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; -} - -static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; -} - /* * Common handling for a write to a "cpus" or "mems" file. */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3587,30 +3224,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, int retval = -ENODEV; buf = strstrip(buf); - - /* - * CPU or memory hotunplug may leave @cs w/o any execution - * resources, in which case the hotplug code asynchronously updates - * configuration and transfers all tasks to the nearest ancestor - * which can execute. 
- * - * As writes to "cpus" or "mems" may restore @cs's execution - * resources, wait for the previously scheduled operations before - * proceeding, so that we don't end up keep removing tasks added - * after execution capability is restored. - * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs - * operation like this one can lead to a deadlock through kernfs - * active_ref protection. Let's break the protection. Losing the - * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy - * hierarchies. - */ - css_get(&cs->css); - kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); - cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) @@ -3638,11 +3251,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); - kernfs_unbreak_active_protection(of->kn); - css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -3655,7 +3268,7 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) +int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; @@ -3696,53 +3309,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static int sched_partition_show(struct seq_file *seq, void *v) +static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); const char *err, *type = NULL; @@ -3773,7 +3340,7 @@ static int sched_partition_show(struct seq_file *seq, void *v) return 0; } -static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, +static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3782,9 +3349,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. 
- */ if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) @@ -3797,11 +3361,8 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - retval = update_prstate(cs, val); -out_unlock: + if (is_cpuset_online(cs)) + retval = update_prstate(cs, val); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); css_put(&cs->css); @@ -3809,113 +3370,6 @@ out_unlock: } /* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype legacy_files[] = { - { - .name = "cpus", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "effective_cpus", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "effective_mems", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - /* obsolete, may be removed in the future */ - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, - }, - - { } /* terminate */ -}; - -/* * This is currently a minimal set for the default hierarchy. It can be * expanded later on by migrating more features and control files from v1. 
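cpuset_partition_write() maps the strings written to cpuset.cpus.partition onto partition-state codes and rejects anything else. A compact userspace mirror of that mapping; the numeric values here are illustrative placeholders rather than the header's definitions.

/* cc -o prs prs.c && ./prs */
#include <stdio.h>
#include <string.h>

enum prs { PRS_MEMBER = 0, PRS_ROOT = 1, PRS_ISOLATED = 2 };

static int parse_partition(const char *buf, int *val)
{
	if (!strcmp(buf, "root"))
		*val = PRS_ROOT;
	else if (!strcmp(buf, "member"))
		*val = PRS_MEMBER;
	else if (!strcmp(buf, "isolated"))
		*val = PRS_ISOLATED;
	else
		return -1;		/* -EINVAL in the kernel */
	return 0;
}

int main(void)
{
	const char *inputs[] = { "root", "member", "isolated", "bogus" };

	for (int i = 0; i < 4; i++) {
		int val;

		if (parse_partition(inputs[i], &val))
			printf("%-8s -> rejected\n", inputs[i]);
		else
			printf("%-8s -> state %d\n", inputs[i], val);
	}
	return 0;
}

As the update_prstate() hunk earlier shows, a failed transition is recorded by negating the requested state, which is why invalid partition states are represented as negative values.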
*/ @@ -3952,8 +3406,8 @@ static struct cftype dfl_files[] = { { .name = "cpus.partition", - .seq_show = sched_partition_show, - .write = sched_partition_write, + .seq_show = cpuset_partition_show, + .write = cpuset_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cpuset, partition_file), @@ -4020,14 +3474,12 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) } __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - nodes_clear(cs->mems_allowed); - nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (cpuset_v2()) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; @@ -4051,6 +3503,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cpuset_v2() && !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -4058,22 +3515,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; - /* - * Clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (!is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } - - /* - * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) @@ -4119,11 +3561,7 @@ out_unlock: * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. - * - * If the cpuset has the 'sched.partition' flag enabled, simulate - * turning 'sched.partition" off. */ - static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); @@ -4131,25 +3569,35 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (is_partition_valid(cs)) - update_prstate(cs, 0); + if (!cpuset_v2() && is_sched_load_balance(cs)) + cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + cpuset_dec(); + clear_bit(CS_ONLINE, &cs->flags); - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } +/* + * If a dying cpuset has the 'cpus.partition' enabled, turn it off by + * changing it back to member to free its exclusive CPUs back to the pool to + * be used by other online cpusets. 
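The new cpuset_css_killed() callback ensures a partition being torn down is switched back to member first, so the CPUs it held exclusively become available to other cpusets again. A toy userspace model of that grab-and-return flow, with invented names and an 8-CPU mask purely for illustration:

/* cc -o pool pool.c && ./pool */
#include <stdint.h>
#include <stdio.h>

static uint64_t pool = 0xffULL;		/* CPUs 0-7 available */

static int partition_enable(uint64_t want, uint64_t *owned)
{
	if ((want & pool) != want)
		return -1;		/* some requested CPUs already taken */
	pool &= ~want;
	*owned = want;
	return 0;
}

static void partition_to_member(uint64_t *owned)
{
	pool |= *owned;			/* give the exclusive CPUs back */
	*owned = 0;
}

int main(void)
{
	uint64_t owned = 0;

	if (!partition_enable(0x30ULL, &owned))		/* CPUs 4-5 */
		printf("pool after enable:   0x%llx\n", (unsigned long long)pool);
	partition_to_member(&owned);
	printf("pool after teardown: 0x%llx\n", (unsigned long long)pool);
	return 0;
}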
+ */ +static void cpuset_css_killed(struct cgroup_subsys_state *css) +{ + struct cpuset *cs = css_cs(css); - cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); + cpus_read_lock(); + mutex_lock(&cpuset_mutex); + + /* Reset valid partition back to member */ + if (is_partition_valid(cs)) + update_prstate(cs, PRS_MEMBER); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); + } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -4233,11 +3681,7 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) if (same_cs) return; - mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + dec_attach_in_progress(cs); } /* @@ -4269,10 +3713,7 @@ static void cpuset_fork(struct task_struct *task) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - + dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } @@ -4280,6 +3721,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, + .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, @@ -4289,7 +3731,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, - .legacy_cftypes = legacy_files, +#ifdef CONFIG_CPUSETS_V1 + .legacy_cftypes = cpuset1_files, +#endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, @@ -4318,78 +3762,18 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - return 0; -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). 
- */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); + have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); + if (have_boot_isolcpus) { + BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); + cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); } -} - -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migrated to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs, new_cpus); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - /* - * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. - */ - if (is_empty) { - mutex_unlock(&cpuset_mutex); - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); - } + return 0; } static void @@ -4409,40 +3793,14 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs, new_cpus); + cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) - update_tasks_nodemask(cs); + cpuset_update_tasks_nodemask(cs); } -static bool force_rebuild; - void cpuset_force_rebuild(void) { - force_rebuild = true; -} - -/* - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in - * progress. - * Return: true if successful, false otherwise - * - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ -static bool cpuset_hotplug_cpus_read_trylock(void) -{ - int retries = 0; - - while (!cpus_read_trylock()) { - /* - * CPU hotplug still in progress. Retry 5 times - * with a 10ms wait before bailing out. - */ - if (++retries > 5) - return false; - msleep(10); - } - return true; + force_sd_rebuild = true; } /** @@ -4493,13 +3851,11 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL) && - cpuset_hotplug_cpus_read_trylock()) { + partition_is_populated(cs, NULL)) { + cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; - cpuset_force_rebuild(); - cpus_read_unlock(); } /* @@ -4519,18 +3875,8 @@ retry: else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update; - /* - * cpus_read_lock needs to be held before calling - * update_parent_effective_cpumask(). To avoid circular lock - * dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. 
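cpuset_init() now records the boot-time housekeeping mask and seeds isolated_cpus with everything outside it via cpumask_andnot(). The same arithmetic on a toy 8-CPU mask, purely as an illustration of the set operation:

/* cc -o isol isol.c && ./isol */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Made-up 8-CPU system: boot isolcpus left CPUs 6-7 out of housekeeping. */
	uint64_t possible     = 0xffULL;
	uint64_t housekeeping = 0x3fULL;

	/* Same idea as cpumask_andnot(): possible & ~housekeeping. */
	uint64_t isolated = possible & ~housekeeping;

	printf("boot-isolated CPUs mask: 0x%llx\n", (unsigned long long)isolated);	/* 0xc0 */
	return 0;
}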
- */ if (partcmd >= 0) { - if (!cpuset_hotplug_cpus_read_trylock()) - goto update_tasks; - update_parent_effective_cpumask(cs, partcmd, NULL, tmp); - cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); @@ -4550,7 +3896,7 @@ update_tasks: hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else - hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); unlock: @@ -4558,8 +3904,7 @@ unlock: } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * @work: unused + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4573,8 +3918,10 @@ unlock: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -4585,6 +3932,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; + lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ @@ -4600,15 +3948,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); - /* - * In the rare case that hotplug removes all the cpus in - * subpartitions_cpus, we assumed that cpus are updated. - */ - if (!cpus_updated && top_cpuset.nr_subparts) - cpus_updated = true; - /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { + cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); @@ -4639,7 +3981,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); - update_tasks_nodemask(&top_cpuset); + cpuset_update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -4663,11 +4005,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_unlock(); } - /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); - } + /* rebuild sched domains if necessary */ + if (force_sd_rebuild) + rebuild_sched_domains_cpuslocked(); free_cpumasks(NULL, ptmp); } @@ -4679,12 +4019,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); } /* @@ -4695,7 +4030,7 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; } @@ -4729,7 +4064,7 @@ void __init cpuset_init_smp(void) * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. 
Guaranteed to return some non-empty - * subset of cpu_online_mask, even if this means going outside the + * subset of cpu_active_mask, even if this means going outside the * tasks cpuset, except when the task is in the top cpuset. **/ @@ -4743,7 +4078,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cs = task_cs(tsk); if (cs != &top_cpuset) - guarantee_online_cpus(tsk, pmask); + guarantee_active_cpus(tsk, pmask); /* * Tasks in the top cpuset won't get update to their cpumasks * when a hotplug online/offline event happens. So we include all @@ -4757,7 +4092,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * allowable online cpu left, we fall back to all possible cpus. */ cpumask_andnot(pmask, possible_mask, subpartitions_cpus); - if (!cpumask_intersects(pmask, cpu_online_mask)) + if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } @@ -4867,7 +4202,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /* - * cpuset_node_allowed - Can we allocate on a memory node? + * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -4906,7 +4241,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -bool cpuset_node_allowed(int node, gfp_t gfp_mask) +bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ @@ -4940,6 +4275,42 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask) return allowed; } +bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + bool allowed; + + /* + * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy + * and mems_allowed is likely to be empty even if we could get to it, + * so return true to avoid taking a global lock on the empty check. + */ + if (!cpuset_v2()) + return true; + + css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); + if (!css) + return true; + + /* + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but node_isset is atomic and the reference + * taken via cgroup_get_e_css is sufficient to protect css. + * + * Since this interface is intended for use by migration paths, we + * relax locking here to avoid taking global locks - while accepting + * there may be rare scenarios where the result may be innaccurate. + * + * Reclaim and migration are subject to these same race conditions, and + * cannot make strong isolation guarantees, so this is acceptable. + */ + cs = container_of(css, struct cpuset, css); + allowed = node_isset(nid, cs->effective_mems); + css_put(css); + return allowed; +} + /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor @@ -4984,19 +4355,6 @@ int cpuset_mem_spread_node(void) } /** - * cpuset_slab_spread_node() - On which node to begin search for a slab page - */ -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - -/** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? 
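cpuset_mem_spread_node() keeps a per-task rotor so successive allocations are spread round-robin over the allowed nodes (the slab-spread variant is dropped here). A minimal userspace sketch of such a rotor over a 64-bit nodemask, with invented helper names:

/* cc -o rotor rotor.c && ./rotor */
#include <stdint.h>
#include <stdio.h>

static int next_node_in(int prev, uint64_t allowed)
{
	if (!allowed)
		return -1;
	for (int i = 1; i <= 64; i++) {
		int node = (prev + i) % 64;

		if (allowed & (1ULL << node))
			return node;
	}
	return -1;	/* unreachable when allowed != 0 */
}

int main(void)
{
	uint64_t mems_allowed = 0x15ULL;	/* nodes 0, 2, 4 */
	int rotor = 0;

	for (int i = 0; i < 5; i++) {
		rotor = next_node_in(rotor, mems_allowed);
		printf("picked node %d\n", rotor);	/* cycles 2, 4, 0, 2, 4 */
	}
	return 0;
}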
* @tsk1: pointer to task_struct of some task. * @tsk2: pointer to task_struct of some other task. @@ -5034,79 +4392,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/* - * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - */ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); - if (retval == -E2BIG) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { |