diff options
Diffstat (limited to 'kernel/cgroup/cpuset.c')
| -rw-r--r-- | kernel/cgroup/cpuset.c | 4344 |
1 files changed, 2661 insertions, 1683 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 479743db6c37..6e6eb09b8db6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,162 +21,120 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ +#include "cpuset-internal.h" -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/cpuset.h> -#include <linux/err.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/list.h> #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> #include <linux/export.h> -#include <linux/mount.h> -#include <linux/namei.h> -#include <linux/pagemap.h> -#include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> -#include <linux/seq_file.h> #include <linux/security.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/time.h> -#include <linux/time64.h> -#include <linux/backing-dev.h> -#include <linux/sort.h> #include <linux/oom.h> #include <linux/sched/isolation.h> -#include <linux/uaccess.h> -#include <linux/atomic.h> -#include <linux/mutex.h> -#include <linux/cgroup.h> #include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ +/* + * There could be abnormal cpuset configurations for cpu or memory + * node binding, add this key to provide a quick low-cost judgment + * of the situation. + */ +DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); + +static const char * const perr_strings[] = { + [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", + [PERR_INVPARENT] = "Parent is an invalid partition root", + [PERR_NOTPART] = "Parent is not a partition root", + [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", + [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", + [PERR_HOTPLUG] = "No cpu available due to hotplug", + [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", + [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", + [PERR_ACCESS] = "Enable partition not permitted", + [PERR_REMOTE] = "Have remote partition underneath", }; -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierachy: - * - * The user-configured masks are always the same with effective masks. - */ - - /* user-configured CPUs and Memory Nodes allow to tasks */ - cpumask_var_t cpus_allowed; - nodemask_t mems_allowed; - - /* effective CPUs and Memory Nodes allow to tasks */ - cpumask_var_t effective_cpus; - nodemask_t effective_mems; - - /* - * CPUs allocated to child sub-partitions (default hierarchy only) - * - CPUs granted by the parent = effective_cpus U subparts_cpus - * - effective_cpus and subparts_cpus are mutually exclusive. - * - * effective_cpus contains only onlined CPUs, but subparts_cpus - * may have offlined ones. - */ - cpumask_var_t subparts_cpus; - - /* - * This is old Memory Nodes tasks took on. - * - * - top_cpuset.old_mems_allowed is initialized to mems_allowed. - * - A new cpuset's old_mems_allowed is initialized when some - * task is moved into it. - * - old_mems_allowed is used in cpuset_migrate_mm() when we change - * cpuset.mems_allowed and have tasks' nodemask updated, and - * then old_mems_allowed is updated to mems_allowed. - */ - nodemask_t old_mems_allowed; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* - * Tasks are being attached to this cpuset. Used to prevent - * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). - */ - int attach_in_progress; - - /* partition number for rebuild_sched_domains() */ - int pn; +/* + * For local partitions, update to subpartitions_cpus & isolated_cpus is done + * in update_parent_effective_cpumask(). For remote partitions, it is done in + * the remote_partition_*() and remote_cpus_update() helpers. + */ +/* + * Exclusive CPUs distributed out to local or remote sub-partitions of + * top_cpuset + */ +static cpumask_var_t subpartitions_cpus; - /* for custom sched domain */ - int relax_domain_level; +/* + * Exclusive CPUs in isolated partitions + */ +static cpumask_var_t isolated_cpus; - /* number of CPUs in subparts_cpus */ - int nr_subparts_cpus; +/* + * isolated_cpus updating flag (protected by cpuset_mutex) + * Set if isolated_cpus is going to be updated in the current + * cpuset_mutex crtical section. + */ +static bool isolated_cpus_updating; - /* partition root state */ - int partition_root_state; +/* + * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot + */ +static cpumask_var_t boot_hk_cpus; +static bool have_boot_isolcpus; - /* - * Default hierarchy only: - * use_parent_ecpus - set if using parent's effective_cpus - * child_ecpus_count - # of children with use_parent_ecpus set - */ - int use_parent_ecpus; - int child_ecpus_count; -}; +/* + * A flag to force sched domain rebuild at the end of an operation. + * It can be set in + * - update_partition_sd_lb() + * - update_cpumasks_hier() + * - cpuset_update_flag() + * - cpuset_hotplug_update_tasks() + * - cpuset_handle_hotplug() + * + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * + * Note that update_relax_domain_level() in cpuset-v1.c can still call + * rebuild_sched_domains_locked() directly without using this flag. + */ +static bool force_sd_rebuild; /* * Partition root states: * - * 0 - not a partition root - * + * 0 - member (not a partition root) * 1 - partition root - * + * 2 - partition root without load balancing (isolated) * -1 - invalid partition root - * None of the cpus in cpus_allowed can be put into the parent's - * subparts_cpus. In this case, the cpuset is not a real partition - * root anymore. However, the CPU_EXCLUSIVE bit will still be set - * and the cpuset can be restored back to a partition root if the - * parent cpuset can give more CPUs back to this child cpuset. + * -2 - invalid isolated partition root + * + * There are 2 types of partitions - local or remote. Local partitions are + * those whose parents are partition root themselves. Setting of + * cpuset.cpus.exclusive are optional in setting up local partitions. + * Remote partitions are those whose parents are not partition roots. Passing + * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor + * nodes are mandatory in creating a remote partition. + * + * For simplicity, a local partition can be created under a local or remote + * partition but a remote partition cannot have any partition root in its + * ancestor chain except the cgroup root. */ -#define PRS_DISABLED 0 -#define PRS_ENABLED 1 -#define PRS_ERROR -1 +#define PRS_MEMBER 0 +#define PRS_ROOT 1 +#define PRS_ISOLATED 2 +#define PRS_INVALID_ROOT -1 +#define PRS_INVALID_ISOLATED -2 /* * Temporary cpumasks for working with partitions that are passed among @@ -187,142 +145,96 @@ struct tmpmasks { cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) +void inc_dl_tasks_cs(struct task_struct *p) { - return false; -} -#endif - - -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; + struct cpuset *cs = task_cs(p); -/* convenient tests for these bits */ -static inline bool is_cpuset_online(struct cpuset *cs) -{ - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + cs->nr_deadline_tasks++; } -static inline int is_cpu_exclusive(const struct cpuset *cs) +void dec_dl_tasks_cs(struct task_struct *p) { - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} + struct cpuset *cs = task_cs(p); -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); + cs->nr_deadline_tasks--; } -static inline int is_mem_hardwall(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { - return test_bit(CS_MEM_HARDWALL, &cs->flags); + return cs->partition_root_state > 0; } -static inline int is_sched_load_balance(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + return cs->partition_root_state < 0; } -static inline int is_memory_migrate(const struct cpuset *cs) +static inline bool cs_is_member(const struct cpuset *cs) { - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); + return cs->partition_root_state == PRS_MEMBER; } -static inline int is_spread_page(const struct cpuset *cs) +/* + * Callers should hold callback_lock to modify partition_root_state. + */ +static inline void make_partition_invalid(struct cpuset *cs) { - return test_bit(CS_SPREAD_PAGE, &cs->flags); + if (cs->partition_root_state > 0) + cs->partition_root_state = -cs->partition_root_state; } -static inline int is_spread_slab(const struct cpuset *cs) +/* + * Send notification event of whenever partition_root_state changes. + */ +static inline void notify_partition_change(struct cpuset *cs, int old_prs) { - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} + if (old_prs == cs->partition_root_state) + return; + cgroup_file_notify(&cs->partition_file); -static inline int is_partition_root(const struct cpuset *cs) -{ - return cs->partition_root_state > 0; + /* Reset prs_err if not invalid */ + if (is_partition_valid(cs)) + WRITE_ONCE(cs->prs_err, PERR_NONE); } -static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), - .partition_root_state = PRS_ENABLED, -}; - -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through the online children of @parent_cs. Must be used - * with RCU read locked. - */ -#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ - css_for_each_child((pos_css), &(parent_cs)->css) \ - if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) - -/** - * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants - * @des_cs: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @root_cs: target cpuset to walk ancestor of +/* + * The top_cpuset is always synchronized to cpu_active_mask and we should avoid + * using cpu_online_mask as much as possible. An active CPU is always an online + * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ + * during hotplug operations. A CPU is marked active at the last stage of CPU + * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code + * will be called to update the sched domains so that the scheduler can move + * a normal task to a newly active CPU or remove tasks away from a newly + * inactivated CPU. The online bit is set much earlier in the CPU bringup + * process and cleared much later in CPU teardown. * - * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. @root_cs is included in the - * iteration and the first node to be visited. + * If cpu_online_mask is used while a hotunplug operation is happening in + * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ -#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ - css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ - if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) +static struct cpuset top_cpuset = { + .flags = BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), + .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, + .remote_partition = false, +}; /* * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. We also require taking task_lock() when dereferencing a - * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. + * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel + * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset + * structures. Note that cpuset_mutex needs to be a mutex as it is used in + * paths that rely on priority inheritance (e.g. scheduler - on RT) for + * correctness. * * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it - * is the only task able to also acquire callback_lock and be able to - * modify cpusets. It can perform various checks on the cpuset structure - * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_mutex. While it is performing these checks, various - * callback routines can briefly acquire callback_lock to query cpusets. - * Once it is ready to make the changes, it takes callback_lock, blocking - * everyone else. + * cpuset_mutex, it blocks others, ensuring that it is the only task able to + * also acquire callback_lock and be able to modify cpusets. It can perform + * various checks on the cpuset structure first, knowing nothing will change. + * It can also allocate memory while just holding cpuset_mutex. While it is + * performing these checks, various callback routines can briefly acquire + * callback_lock to query cpusets. Once it is ready to make the changes, it + * takes callback_lock, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_lock, as that would risk double tripping on callback_lock @@ -336,90 +248,199 @@ static struct cpuset top_cpuset = { * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_lock across + * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. - * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c */ static DEFINE_MUTEX(cpuset_mutex); + +/** + * cpuset_lock - Acquire the global cpuset mutex + * + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. This helper is not enough to make modification. + */ +void cpuset_lock(void) +{ + mutex_lock(&cpuset_mutex); +} + +void cpuset_unlock(void) +{ + mutex_unlock(&cpuset_mutex); +} + +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data. + */ +void cpuset_full_lock(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +void cpuset_full_unlock(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + static DEFINE_SPINLOCK(callback_lock); +void cpuset_callback_lock_irq(void) +{ + spin_lock_irq(&callback_lock); +} + +void cpuset_callback_unlock_irq(void) +{ + spin_unlock_irq(&callback_lock); +} + static struct workqueue_struct *cpuset_migrate_mm_wq; +static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); + +static inline void check_insane_mems_config(nodemask_t *nodes) +{ + if (!cpusets_insane_config() && + movable_only_nodes(nodes)) { + static_branch_enable_cpuslocked(&cpusets_insane_config_key); + pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" + "Cpuset allocations might fail even with a lot of memory available.\n", + nodemask_pr_args(nodes)); + } +} + /* - * CPU / memory hotplug is handled asynchronously. + * decrease cs->attach_in_progress. + * wake_up cpuset_attach_wq if cs->attach_in_progress==0. */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +static inline void dec_attach_in_progress_locked(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); -static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +} + +static inline void dec_attach_in_progress(struct cpuset *cs) +{ + mutex_lock(&cpuset_mutex); + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); +} + +static inline bool cpuset_v2(void) +{ + return !IS_ENABLED(CONFIG_CPUSETS_V1) || + cgroup_subsys_on_dfl(cpuset_cgrp_subsys); +} /* - * Cgroup v2 behavior is used when on default hierarchy or the - * cgroup_v2_mode flag is set. + * Cgroup v2 behavior is used on the "cpus" and "mems" control files when + * on default hierarchy or when the cpuset_v2_mode flag is set by mounting + * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. + * With v2 behavior, "cpus" and "mems" are always what the users have + * requested and won't be changed by hotplug events. Only the effective + * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -/* - * This is ugly, but preserves the userspace API for existing cpuset - * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead - */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); - } - return ret; +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; } -static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, -}; +/** + * partition_is_populated - check if partition has tasks + * @cs: partition root to be checked + * @excluded_child: a child cpuset to be excluded in task checking + * Return: true if there are tasks, false otherwise + * + * @cs should be a valid partition root or going to become a partition root. + * @excluded_child should be non-NULL when this cpuset is going to become a + * partition itself. + * + * Note that a remote partition is not allowed underneath a valid local + * or remote partition. So if a non-partition root child is populated, + * the whole partition is considered populated. + */ +static inline bool partition_is_populated(struct cpuset *cs, + struct cpuset *excluded_child) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + /* + * We cannot call cs_is_populated(cs) directly, as + * nr_populated_domain_children may include populated + * csets from descendants that are partitions. + */ + if (cs->css.cgroup->nr_populated_csets || + cs->attach_in_progress) + return true; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + if (cp == cs || cp == excluded_child) + continue; + + if (is_partition_valid(cp)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (cpuset_is_populated(cp)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} /* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. + * Return in pmask the portion of a task's cpusets's cpus_allowed that + * are online and are capable of running the task. If none are found, + * walk up the cpuset hierarchy until we find one that does have some + * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) +static void guarantee_active_cpus(struct task_struct *tsk, + struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + struct cpuset *cs; + + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); + + rcu_read_lock(); + cs = task_cs(tsk); + + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be - * identical to cpu_online_mask. - */ - cpumask_copy(pmask, cpu_online_mask); - return; - } - } - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); + + cpumask_and(pmask, pmask, cs->effective_cpus); + rcu_read_unlock(); } /* @@ -440,119 +461,105 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } -/* - * update task's spread flag if cpuset's page/slab spread flag is set +/** + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate + * Return: 0 if successful, -ENOMEM otherwise. * - * Call with callback_lock or cpuset_mutex held. + * Allocates @size cpumasks and initializes them to empty. Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed. */ -static void cpuset_update_task_spread_flag(struct cpuset *cs, - struct task_struct *tsk) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} + int i; -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. - */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } + } + return 0; } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer - * Return: 0 if successful, -ENOMEM otherwise. - * - * Only one of the two input arguments should be non-NULL. + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure */ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_tmpmasks(struct tmpmasks *tmp) { - cpumask_var_t *pmask1, *pmask2, *pmask3; - - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->subparts_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - return 0; + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free. + * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->subparts_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + if (!tmp) + return; + + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -562,10 +569,82 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } +/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{ + return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed + : cs->exclusive_cpus; +} + +static inline bool xcpus_empty(struct cpuset *cs) +{ + return cpumask_empty(cs->cpus_allowed) && + cpumask_empty(cs->exclusive_cpus); +} + +/* + * cpusets_are_exclusive() - check if two cpusets are exclusive + * + * Return true if exclusive, false if not + */ +static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) +{ + struct cpumask *xcpus1 = user_xcpus(cs1); + struct cpumask *xcpus2 = user_xcpus(cs2); + + if (cpumask_intersects(xcpus1, xcpus2)) + return false; + return true; +} + +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -590,50 +669,27 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; + int ret = 0; rcu_read_lock(); - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; + if (!is_in_v2_mode()) + ret = cpuset1_validate_change(cur, trial); + if (ret) + goto out; /* Remaining checks don't apply to root cpuset */ - ret = 0; if (cur == &top_cpuset) goto out; par = parent_cs(cur); - /* On legacy hiearchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) - goto out; - - /* - * If either I or some sibling (!= me) is exclusive, we can't - * overlap - */ - ret = -EINVAL; - cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) - goto out; - } - /* * Cpusets with tasks - existing or newly being attached - can't * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { + if (cpuset_is_populated(cur)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -644,14 +700,40 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* * We can't shrink if we won't have enough room for SCHED_DEADLINE - * tasks. + * tasks. This check is not done when scheduling is disabled as the + * users should know what they are doing. + * + * For v1, effective_cpus == cpus_allowed & user_xcpus() returns + * cpus_allowed. + * + * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only + * for non-isolated partition root. At this point, the target + * effective_cpus isn't computed yet. user_xcpus() is the best + * approximation. + * + * TBD: May need to precompute the real effective_cpus here in case + * incorrect scheduling of SCHED_DEADLINE tasks in a partition + * becomes an issue. */ ret = -EBUSY; - if (is_cpu_exclusive(cur) && - !cpuset_cpumask_can_shrink(cur->cpus_allowed, - trial->cpus_allowed)) + if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && + !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) goto out; + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap. exclusive_cpus cannot overlap with each other if set. + */ + ret = -EINVAL; + cpuset_for_each_child(c, css, par) { + if (c == cur) + continue; + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) + goto out; + } + ret = 0; out: rcu_read_unlock(); @@ -714,7 +796,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this @@ -725,11 +807,10 @@ static inline int nr_cpusets(void) * Must be called with cpuset_mutex held. * * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. + * cp - cpuset pointer, used (together with pos_css) to perform a + * top-down scan of all cpusets. For our purposes, rebuilding + * the schedulers sched domains, we can ignore !is_sched_load_ + * balance cpusets. * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, @@ -744,39 +825,39 @@ static inline int nr_cpusets(void) * were changed (added or removed.) * * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. - * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). + * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - struct cpuset *cp; /* scans q */ + struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ + int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); + bool cgrpv2 = cpuset_v2(); + int nslot_update; doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts_cpus) { + if (root_load_balance && cpumask_empty(subpartitions_cpus)) { +single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -788,7 +869,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } @@ -804,63 +885,81 @@ static int generate_sched_domains(cpumask_var_t **domains, cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; + + if (cgrpv2) + goto v2; + /* + * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. - * - * If root is load-balancing, we can skip @cp if it - * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_FLAG_DOMAIN)))) + housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; - if (root_load_balance && - cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) - continue; + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; - if (is_sched_load_balance(cp)) + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + +v2: + /* + * Only valid partition roots that are not isolated and with + * non-empty effective_cpus will be saved into csn[]. + */ + if ((cp->partition_root_state == PRS_ROOT) && + !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; - /* skip @cp's subtree if not a partition root */ - if (!is_partition_root(cp)) + /* + * Skip @cp's subtree if not a partition root and has no + * exclusive CPUs to be granted to child cpusets. + */ + if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); + /* + * If there are only isolated partitions underneath the cgroup root, + * we can optimize out unneeded sched domains scanning. + */ + if (root_load_balance && (csn == 1)) + goto single_root_domain; + for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; + uf_node_init(&csa[i]->node); -restart: - /* Find the best partition (set of sched domains) */ + /* Merge overlapping cpusets */ for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) { + /* + * Cgroup v2 shouldn't pass down overlapping + * partition root cpusets. + */ + WARN_ON_ONCE(cgrpv2); + uf_union(&csa[i]->node, &csa[j]->node); } } } + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. @@ -876,45 +975,48 @@ restart: dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); - for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + /* + * Cgroup v2 doesn't support domain attributes, just set all of them + * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a + * subset of HK_TYPE_DOMAIN housekeeping CPUs. + */ + if (cgrpv2) { + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (csa[i] == &top_cpuset) + cpumask_and(doms[i], csa[i]->effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } + goto done; + } - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; + update_domain_attr_tree(dattr + nslot, csa[j]); } } - nslot++; + if (nslot_update) + nslot++; } BUG_ON(nslot != ndoms); @@ -933,6 +1035,61 @@ done: return ndoms; } +static void dl_update_tasks_root_domain(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + if (cs->nr_deadline_tasks == 0) + return; + + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) + dl_add_task_root_domain(task); + + css_task_iter_end(&it); +} + +void dl_rebuild_rd_accounting(void) +{ + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + int cpu; + u64 cookie = ++dl_cookie; + + lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + rcu_read_lock(); + + for_each_possible_cpu(cpu) { + if (dl_bw_visited(cpu, cookie)) + continue; + + dl_clear_root_domain_cpu(cpu); + } + + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + + if (cpumask_empty(cs->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + css_get(&cs->css); + + rcu_read_unlock(); + + dl_update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); +} + /* * Rebuild scheduler domains. * @@ -942,67 +1099,127 @@ done: * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). + * Call with cpuset_mutex held. Takes cpus_read_lock(). */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { + struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; + struct cpuset *cs; int ndoms; + lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); + force_sd_rebuild = false; /* - * We have raced with CPU hotplug. Don't do anything to avoid + * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, hotplug work item will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. + * + * With no CPUs in any subpartitions, top_cpuset's effective CPUs + * should be the same as the active CPUs, so checking only top_cpuset + * is enough to detect racing CPU offlines. */ - if (!top_cpuset.nr_subparts_cpus && + if (cpumask_empty(subpartitions_cpus) && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; - if (top_cpuset.nr_subparts_cpus && - !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + /* + * With subpartition CPUs, however, the effective CPUs of a partition + * root should be only a subset of the active CPUs. Since a CPU in any + * partition root could be offlined, all must be checked. + */ + if (!cpumask_empty(subpartitions_cpus)) { + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + if (!is_partition_valid(cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + if (!cpumask_subset(cs->effective_cpus, + cpu_active_mask)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + } /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ -void rebuild_sched_domains(void) +static void rebuild_sched_domains_cpuslocked(void) { mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); } +void rebuild_sched_domains(void) +{ + cpus_read_lock(); + rebuild_sched_domains_cpuslocked(); + cpus_read_unlock(); +} + +void cpuset_reset_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + partition_sched_domains(1, NULL, NULL); + mutex_unlock(&cpuset_mutex); +} + /** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. + * + * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus + * to make sure all offline CPUs are also included as hotplug code won't + * update cpumasks for tasks in top_cpuset. + * + * As task_cpu_possible_mask() can be task dependent in arm64, we have to + * do cpu masking per task instead of doing it once for all. */ -static void update_tasks_cpumask(struct cpuset *cs) +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; + bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cs->effective_cpus); + while ((task = css_task_iter_next(&it))) { + const struct cpumask *possible_mask = task_cpu_possible_mask(task); + + if (top_cs) { + /* + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). + */ + if (task->flags & PF_NO_SETAFFINITY) + continue; + cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); + } else { + cpumask_and(new_cpus, possible_mask, cs->effective_cpus); + } + set_cpus_allowed_ptr(task, new_cpus); + } css_task_iter_end(&it); } @@ -1012,375 +1229,1137 @@ static void update_tasks_cpumask(struct cpuset *cs) * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * - * If the parent has subpartition CPUs, include them in the list of - * allowable CPUs in computing the new effective_cpus mask. Since offlined - * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask - * to mask those out. + * The result is valid only if the given cpuset isn't a partition root. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus) { - cpumask_or(new_cpus, parent->effective_cpus, - parent->subparts_cpus); - cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); - cpumask_and(new_cpus, new_cpus, cpu_active_mask); + cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); +} + +/* + * Commands for update_parent_effective_cpumask + */ +enum partition_cmd { + partcmd_enable, /* Enable partition root */ + partcmd_enablei, /* Enable isolated partition root */ + partcmd_disable, /* Disable partition root */ + partcmd_update, /* Update parent's effective_cpus */ + partcmd_invalidate, /* Make partition invalid */ +}; + +static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, + struct tmpmasks *tmp); + +/* + * Update partition exclusive flag + * + * Return: 0 if successful, an error code otherwise + */ +static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) +{ + bool exclusive = (new_prs > PRS_MEMBER); + + if (exclusive && !is_cpu_exclusive(cs)) { + if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + return PERR_NOTEXCL; + } else if (!exclusive && is_cpu_exclusive(cs)) { + /* Turning off CS_CPU_EXCLUSIVE will not return error */ + cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + return 0; +} + +/* + * Update partition load balance flag and/or rebuild sched domain + * + * Changing load balance flag will automatically call + * rebuild_sched_domains_locked(). + * This function is for cgroup v2 only. + */ +static void update_partition_sd_lb(struct cpuset *cs, int old_prs) +{ + int new_prs = cs->partition_root_state; + bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + bool new_lb; + + /* + * If cs is not a valid partition root, the load balance state + * will follow its parent. + */ + if (new_prs > 0) { + new_lb = (new_prs != PRS_ISOLATED); } else { - cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); + new_lb = is_sched_load_balance(parent_cs(cs)); + } + if (new_lb != !!is_sched_load_balance(cs)) { + rebuild_domains = true; + if (new_lb) + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } + + if (rebuild_domains) + cpuset_force_rebuild(); } /* - * Commands for update_parent_subparts_cpumask + * tasks_nocpu_error - Return true if tasks will have no effective_cpus */ -enum subparts_cmd { - partcmd_enable, /* Enable partition root */ - partcmd_disable, /* Disable partition root */ - partcmd_update, /* Update parent's subparts_cpus */ -}; +static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, + struct cpumask *xcpus) +{ + /* + * A populated partition (cs or parent) can't have empty effective_cpus + */ + return (cpumask_subset(parent->effective_cpus, xcpus) && + partition_is_populated(parent, cs)) || + (!cpumask_intersects(xcpus, cpu_active_mask) && + partition_is_populated(cs, NULL)); +} + +static void reset_partition_data(struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(cs); + + if (!cpuset_v2()) + return; + + lockdep_assert_held(&callback_lock); + + if (cpumask_empty(cs->exclusive_cpus)) { + cpumask_clear(cs->effective_xcpus); + if (is_cpu_exclusive(cs)) + clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); + } + if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) + cpumask_copy(cs->effective_cpus, parent->effective_cpus); +} + +/* + * isolated_cpus_update - Update the isolated_cpus mask + * @old_prs: old partition_root_state + * @new_prs: new partition_root_state + * @xcpus: exclusive CPUs with state change + */ +static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs == new_prs); + if (new_prs == PRS_ISOLATED) + cpumask_or(isolated_cpus, isolated_cpus, xcpus); + else + cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); + + isolated_cpus_updating = true; +} + +/* + * partition_xcpus_add - Add new exclusive CPUs to partition + * @new_prs: new partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be added + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_add(int new_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(new_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + + if (parent == &top_cpuset) + cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (new_prs != parent->partition_root_state) + isolated_cpus_update(parent->partition_root_state, new_prs, + xcpus); + + cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); +} + +/* + * partition_xcpus_del - Remove exclusive CPUs from partition + * @old_prs: old partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be removed + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_del(int old_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + if (parent == &top_cpuset) + cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (old_prs != parent->partition_root_state) + isolated_cpus_update(old_prs, parent->partition_root_state, + xcpus); + + cpumask_and(xcpus, xcpus, cpu_active_mask); + cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); +} + +/* + * isolated_cpus_can_update - check for isolated & nohz_full conflicts + * @add_cpus: cpu mask for cpus that are going to be isolated + * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL + * Return: false if there is conflict, true otherwise + * + * If nohz_full is enabled and we have isolated CPUs, their combination must + * still leave housekeeping CPUs. + * + * TBD: Should consider merging this function into + * prstate_housekeeping_conflict(). + */ +static bool isolated_cpus_can_update(struct cpumask *add_cpus, + struct cpumask *del_cpus) +{ + cpumask_var_t full_hk_cpus; + int res = true; + + if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE)) + return true; + + if (del_cpus && cpumask_weight_and(del_cpus, + housekeeping_cpumask(HK_TYPE_KERNEL_NOISE))) + return true; + + if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL)) + return false; + + cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), + housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus); + cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask); + if (!cpumask_weight_andnot(full_hk_cpus, add_cpus)) + res = false; + + free_cpumask_var(full_hk_cpus); + return res; +} + +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. + */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + if (!have_boot_isolcpus) + return false; + + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + return true; + + return false; +} + +/* + * update_isolation_cpumasks - Update external isolation related CPU masks + * + * The following external CPU masks will be updated if necessary: + * - workqueue unbound cpumask + */ +static void update_isolation_cpumasks(void) +{ + int ret; + + if (!isolated_cpus_updating) + return; + + lockdep_assert_cpus_held(); + + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); + + ret = tmigr_isolated_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); + + isolated_cpus_updating = false; +} /** - * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset - * @cpuset: The cpuset that requests change in partition root state - * @cmd: Partition root state change command - * @newmask: Optional new cpumask for partcmd_update - * @tmp: Temporary addmask and delmask - * Return: 0, 1 or an error code - * - * For partcmd_enable, the cpuset is being transformed from a non-partition - * root to a partition root. The cpus_allowed mask of the given cpuset will - * be put into parent's subparts_cpus and taken away from parent's - * effective_cpus. The function will return 0 if all the CPUs listed in - * cpus_allowed can be granted or an error code will be returned. - * - * For partcmd_disable, the cpuset is being transofrmed from a partition - * root back to a non-partition root. any CPUs in cpus_allowed that are in - * parent's subparts_cpus will be taken away from that cpumask and put back - * into parent's effective_cpus. 0 should always be returned. - * - * For partcmd_update, if the optional newmask is specified, the cpu - * list is to be changed from cpus_allowed to newmask. Otherwise, - * cpus_allowed is assumed to remain the same. The cpuset should either - * be a partition root or an invalid partition root. The partition root - * state may change if newmask is NULL and none of the requested CPUs can - * be granted by the parent. The function will return 1 if changes to - * parent's subparts_cpus and effective_cpus happen or 0 otherwise. - * Error code should only be returned when newmask is non-NULL. - * - * The partcmd_enable and partcmd_disable commands are used by - * update_prstate(). The partcmd_update command is used by - * update_cpumasks_hier() with newmask NULL and update_cpumask() with - * newmask set. - * - * The checking is more strict when enabling partition root than the - * other two commands. - * - * Because of the implicit cpu exclusive nature of a partition root, - * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. - */ -static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, - struct cpumask *newmask, - struct tmpmasks *tmp) -{ - struct cpuset *parent = parent_cs(cpuset); - int adding; /* Moving cpus from effective_cpus to subparts_cpus */ - int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ - bool part_error = false; /* Partition error? */ + * cpuset_cpu_is_isolated - Check if the given CPU is isolated + * @cpu: the CPU number to be checked + * Return: true if CPU is used in an isolated partition, false otherwise + */ +bool cpuset_cpu_is_isolated(int cpu) +{ + return cpumask_test_cpu(cpu, isolated_cpus); +} +EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); - lockdep_assert_held(&cpuset_mutex); +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify + * + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. + */ +static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, + struct cpumask *excpus) +{ + struct cgroup_subsys_state *css; + struct cpuset *sibling; + int retval = 0; + + if (cpumask_empty(excpus)) + return retval; /* - * The parent must be a partition root. - * The new cpumask, if present, or the current cpus_allowed must - * not be empty. + * Exclude exclusive CPUs from siblings */ - if (!is_partition_root(parent) || - (newmask && cpumask_empty(newmask)) || - (!newmask && cpumask_empty(cpuset->cpus_allowed))) - return -EINVAL; + rcu_read_lock(); + cpuset_for_each_child(sibling, css, parent) { + if (sibling == cs) + continue; + + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); + retval++; + continue; + } + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); + retval++; + } + } + rcu_read_unlock(); + + return retval; +} + +/* + * compute_excpus - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: 0 if there is no sibling conflict, > 0 otherwise + * + * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets + * and exclude their exclusive_cpus or effective_xcpus as well. + */ +static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) +{ + struct cpuset *parent = parent_cs(cs); + + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); + + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +/* + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset + * @trialcs: The trial cpuset containing the proposed new configuration + * @cs: The original cpuset that the trial configuration is based on + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found + * + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent + * the real cs. + */ +static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(trialcs); + struct cpumask *excpus = trialcs->effective_xcpus; + + /* trialcs is member, cpuset.cpus has no impact to excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +static inline bool is_remote_partition(struct cpuset *cs) +{ + return cs->remote_partition; +} + +static inline bool is_local_partition(struct cpuset *cs) +{ + return is_partition_valid(cs) && !is_remote_partition(cs); +} +/* + * remote_partition_enable - Enable current cpuset as a remote partition root + * @cs: the cpuset to update + * @new_prs: new partition_root_state + * @tmp: temporary masks + * Return: 0 if successful, errcode if error + * + * Enable the current cpuset to become a remote partition root taking CPUs + * directly from the top cpuset. cpuset_mutex must be held by the caller. + */ +static int remote_partition_enable(struct cpuset *cs, int new_prs, + struct tmpmasks *tmp) +{ /* - * Enabling/disabling partition root is not allowed if there are - * online children. + * The user must have sysadmin privilege. */ - if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css)) - return -EBUSY; + if (!capable(CAP_SYS_ADMIN)) + return PERR_ACCESS; /* - * Enabling partition root is not allowed if not all the CPUs - * can be granted from parent's effective_cpus or at least one - * CPU will be left after that. + * The requested exclusive_cpus must not be allocated to other + * partitions and it can't use up all the root's effective_cpus. + * + * The effective_xcpus mask can contain offline CPUs, but there must + * be at least one or more online CPUs present before it can be enabled. + * + * Note that creating a remote partition with any local partition root + * above it or remote partition root underneath it is not allowed. */ - if ((cmd == partcmd_enable) && - (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) || - cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))) - return -EINVAL; + compute_excpus(cs, tmp->new_cpus); + WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); + if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || + cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) + return PERR_INVCPUS; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->new_cpus, NULL)) || + prstate_housekeeping_conflict(new_prs, tmp->new_cpus)) + return PERR_HKEEPING; + + spin_lock_irq(&callback_lock); + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + cs->remote_partition = true; + cpumask_copy(cs->effective_xcpus, tmp->new_cpus); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + cpuset_force_rebuild(); + cs->prs_err = 0; + + /* + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. + */ + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return 0; +} + +/* + * remote_partition_disable - Remove current cpuset from remote partition list + * @cs: the cpuset to update + * @tmp: temporary masks + * + * The effective_cpus is also updated. + * + * cpuset_mutex must be held by the caller. + */ +static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) +{ + WARN_ON_ONCE(!is_remote_partition(cs)); + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + spin_lock_irq(&callback_lock); + cs->remote_partition = false; + partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); + if (cs->prs_err) + cs->partition_root_state = -cs->partition_root_state; + else + cs->partition_root_state = PRS_MEMBER; + + /* effective_xcpus may need to be changed */ + compute_excpus(cs, cs->effective_xcpus); + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + cpuset_force_rebuild(); + + /* + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. + */ + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); +} + +/* + * remote_cpus_update - cpus_exclusive change of remote partition + * @cs: the cpuset to be updated + * @xcpus: the new exclusive_cpus mask, if non-NULL + * @excpus: the new effective_xcpus mask + * @tmp: temporary masks + * + * top_cpuset and subpartitions_cpus will be updated or partition can be + * invalidated. + */ +static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, + struct cpumask *excpus, struct tmpmasks *tmp) +{ + bool adding, deleting; + int prs = cs->partition_root_state; + + if (WARN_ON_ONCE(!is_remote_partition(cs))) + return; + + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + if (cpumask_empty(excpus)) { + cs->prs_err = PERR_CPUSEMPTY; + goto invalidate; + } + + adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); + + /* + * Additions of remote CPUs is only allowed if those CPUs are + * not allocated to other partitions and there are effective_cpus + * left in the top cpuset. + */ + if (adding) { + WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); + if (!capable(CAP_SYS_ADMIN)) + cs->prs_err = PERR_ACCESS; + else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) + cs->prs_err = PERR_NOCPUS; + else if ((prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + cs->prs_err = PERR_HKEEPING; + if (cs->prs_err) + goto invalidate; + } + + spin_lock_irq(&callback_lock); + if (adding) + partition_xcpus_add(prs, NULL, tmp->addmask); + if (deleting) + partition_xcpus_del(prs, NULL, tmp->delmask); + /* + * Need to update effective_xcpus and exclusive_cpus now as + * update_sibling_cpumasks() below may iterate back to the same cs. + */ + cpumask_copy(cs->effective_xcpus, excpus); + if (xcpus) + cpumask_copy(cs->exclusive_cpus, xcpus); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + if (adding || deleting) + cpuset_force_rebuild(); /* - * A cpumask update cannot make parent's effective_cpus become empty. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. + */ + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return; + +invalidate: + remote_partition_disable(cs, tmp); +} + +/** + * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset + * @cs: The cpuset that requests change in partition root state + * @cmd: Partition root state change command + * @newmask: Optional new cpumask for partcmd_update + * @tmp: Temporary addmask and delmask + * Return: 0 or a partition root state error code + * + * For partcmd_enable*, the cpuset is being transformed from a non-partition + * root to a partition root. The effective_xcpus (cpus_allowed if + * effective_xcpus not set) mask of the given cpuset will be taken away from + * parent's effective_cpus. The function will return 0 if all the CPUs listed + * in effective_xcpus can be granted or an error code will be returned. + * + * For partcmd_disable, the cpuset is being transformed from a partition + * root back to a non-partition root. Any CPUs in effective_xcpus will be + * given back to parent's effective_cpus. 0 will always be returned. + * + * For partcmd_update, if the optional newmask is specified, the cpu list is + * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is + * assumed to remain the same. The cpuset should either be a valid or invalid + * partition root. The partition root state may change from valid to invalid + * or vice versa. An error code will be returned if transitioning from + * invalid to valid violates the exclusivity rule. + * + * For partcmd_invalidate, the current partition will be made invalid. + * + * The partcmd_enable* and partcmd_disable commands are used by + * update_prstate(). An error code may be returned and the caller will check + * for error. + * + * The partcmd_update command is used by update_cpumasks_hier() with newmask + * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used + * by update_cpumask() with NULL newmask. In both cases, the callers won't + * check for error and so partition_root_state and prs_err will be updated + * directly. + */ +static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + struct cpumask *newmask, + struct tmpmasks *tmp) +{ + struct cpuset *parent = parent_cs(cs); + int adding; /* Adding cpus to parent's effective_cpus */ + int deleting; /* Deleting cpus from parent's effective_cpus */ + int old_prs, new_prs; + int part_error = PERR_NONE; /* Partition error? */ + struct cpumask *xcpus = user_xcpus(cs); + int parent_prs = parent->partition_root_state; + bool nocpu; + + lockdep_assert_held(&cpuset_mutex); + WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ + + /* + * new_prs will only be changed for the partcmd_update and + * partcmd_invalidate commands. */ adding = deleting = false; - if (cmd == partcmd_enable) { - cpumask_copy(tmp->addmask, cpuset->cpus_allowed); - adding = true; + old_prs = new_prs = cs->partition_root_state; + + if (cmd == partcmd_invalidate) { + if (is_partition_invalid(cs)) + return 0; + + /* + * Make the current partition invalid. + */ + if (is_partition_valid(parent)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + if (old_prs > 0) + new_prs = -old_prs; + + goto write_error; + } + + /* + * The parent must be a partition root. + * The new cpumask, if present, or the current cpus_allowed must + * not be empty. + */ + if (!is_partition_valid(parent)) { + return is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART; + } + if (!newmask && xcpus_empty(cs)) + return PERR_CPUSEMPTY; + + nocpu = tasks_nocpu_error(parent, cs, xcpus); + + if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { + /* + * Need to call compute_excpus() in case + * exclusive_cpus not set. Sibling conflict should only happen + * if exclusive_cpus isn't set. + */ + xcpus = tmp->delmask; + if (compute_excpus(cs, xcpus)) + WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; + + /* + * Enabling partition root is not allowed if its + * effective_xcpus is empty. + */ + if (cpumask_empty(xcpus)) + return PERR_INVCPUS; + + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + + if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) && + !isolated_cpus_can_update(xcpus, NULL)) + return PERR_HKEEPING; + + if (tasks_nocpu_error(parent, cs, xcpus)) + return PERR_NOCPUS; + + /* + * This function will only be called when all the preliminary + * checks have passed. At this point, the following condition + * should hold. + * + * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus + * + * Warn if it is not the case. + */ + cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + + deleting = true; } else if (cmd == partcmd_disable) { - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, - parent->subparts_cpus); + /* + * May need to add cpus back to parent's effective_cpus + * (and maybe removed from subpartitions_cpus/isolated_cpus) + * for valid partition root. xcpus may contain CPUs that + * shouldn't be removed from the two global cpumasks. + */ + if (is_partition_valid(cs)) { + cpumask_copy(tmp->addmask, cs->effective_xcpus); + adding = true; + } + new_prs = PRS_MEMBER; } else if (newmask) { /* + * Empty cpumask is not allowed + */ + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; + goto write_error; + } + + /* Check newmask again, whether cpus are available for parent/cs */ + nocpu |= tasks_nocpu_error(parent, cs, newmask); + + /* * partcmd_update with newmask: * - * delmask = cpus_allowed & ~newmask & parent->subparts_cpus - * addmask = newmask & parent->effective_cpus - * & ~parent->subparts_cpus + * Compute add/delete mask to/from effective_cpus + * + * For valid partition: + * addmask = exclusive_cpus & ~newmask + * & parent->effective_xcpus + * delmask = newmask & ~exclusive_cpus + * & parent->effective_xcpus + * + * For invalid partition: + * delmask = newmask & parent->effective_xcpus + * The partition may become valid soon. */ - cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask); - deleting = cpumask_and(tmp->delmask, tmp->delmask, - parent->subparts_cpus); + if (is_partition_invalid(cs)) { + adding = false; + deleting = cpumask_and(tmp->delmask, + newmask, parent->effective_xcpus); + } else { + cpumask_andnot(tmp->addmask, xcpus, newmask); + adding = cpumask_and(tmp->addmask, tmp->addmask, + parent->effective_xcpus); + + cpumask_andnot(tmp->delmask, newmask, xcpus); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->effective_xcpus); + } - cpumask_and(tmp->addmask, newmask, parent->effective_cpus); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); /* - * Return error if the new effective_cpus could become empty. + * TBD: Invalidate a currently valid child root partition may + * still break isolated_cpus_can_update() rule if parent is an + * isolated partition. */ - if (adding && - cpumask_equal(parent->effective_cpus, tmp->addmask)) { - if (!deleting) - return -EINVAL; - /* - * As some of the CPUs in subparts_cpus might have - * been offlined, we need to compute the real delmask - * to confirm that. - */ - if (!cpumask_and(tmp->addmask, tmp->delmask, - cpu_active_mask)) - return -EINVAL; - cpumask_copy(tmp->addmask, parent->effective_cpus); + if (is_partition_valid(cs) && (old_prs != parent_prs)) { + if ((parent_prs == PRS_ROOT) && + /* Adding to parent means removing isolated CPUs */ + !isolated_cpus_can_update(tmp->delmask, tmp->addmask)) + part_error = PERR_HKEEPING; + if ((parent_prs == PRS_ISOLATED) && + /* Adding to parent means adding isolated CPUs */ + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + part_error = PERR_HKEEPING; + } + + /* + * The new CPUs to be removed from parent's effective CPUs + * must be present. + */ + if (deleting) { + cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + } + + /* + * Make partition invalid if parent's effective_cpus could + * become empty and there are tasks in the parent. + */ + if (nocpu && (!adding || + !cpumask_intersects(tmp->addmask, cpu_active_mask))) { + part_error = PERR_NOCPUS; + deleting = false; + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); } } else { /* - * partcmd_update w/o newmask: + * partcmd_update w/o newmask + * + * delmask = effective_xcpus & parent->effective_cpus * - * addmask = cpus_allowed & parent->effectiveb_cpus + * This can be called from: + * 1) update_cpumasks_hier() + * 2) cpuset_hotplug_update_tasks() * - * Note that parent's subparts_cpus may have been - * pre-shrunk in case there is a change in the cpu list. - * So no deletion is needed. + * Check to see if it can be transitioned from valid to + * invalid partition or vice versa. + * + * A partition error happens when parent has tasks and all + * its effective CPUs will have to be distributed out. */ - adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, - parent->effective_cpus); - part_error = cpumask_equal(tmp->addmask, - parent->effective_cpus); + if (nocpu) { + part_error = PERR_NOCPUS; + if (is_partition_valid(cs)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && + cpumask_subset(xcpus, parent->effective_xcpus)) { + struct cgroup_subsys_state *css; + struct cpuset *child; + bool exclusive = true; + + /* + * Convert invalid partition to valid has to + * pass the cpu exclusivity test. + */ + rcu_read_lock(); + cpuset_for_each_child(child, css, parent) { + if (child == cs) + continue; + if (!cpusets_are_exclusive(cs, child)) { + exclusive = false; + break; + } + } + rcu_read_unlock(); + if (exclusive) + deleting = cpumask_and(tmp->delmask, + xcpus, parent->effective_cpus); + else + part_error = PERR_NOTEXCL; + } } - if (cmd == partcmd_update) { - int prev_prs = cpuset->partition_root_state; +write_error: + if (part_error) + WRITE_ONCE(cs->prs_err, part_error); + if (cmd == partcmd_update) { /* - * Check for possible transition between PRS_ENABLED - * and PRS_ERROR. + * Check for possible transition between valid and invalid + * partition root. */ - switch (cpuset->partition_root_state) { - case PRS_ENABLED: + switch (cs->partition_root_state) { + case PRS_ROOT: + case PRS_ISOLATED: if (part_error) - cpuset->partition_root_state = PRS_ERROR; + new_prs = -old_prs; break; - case PRS_ERROR: + case PRS_INVALID_ROOT: + case PRS_INVALID_ISOLATED: if (!part_error) - cpuset->partition_root_state = PRS_ENABLED; + new_prs = -old_prs; break; } - /* - * Set part_error if previously in invalid state. - */ - part_error = (prev_prs == PRS_ERROR); } - if (!part_error && (cpuset->partition_root_state == PRS_ERROR)) - return 0; /* Nothing need to be done */ + if (!adding && !deleting && (new_prs == old_prs)) + return 0; - if (cpuset->partition_root_state == PRS_ERROR) { - /* - * Remove all its cpus from parent's subparts_cpus. - */ - adding = false; - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, - parent->subparts_cpus); - } + /* + * Transitioning between invalid to valid or vice versa may require + * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, + * validate_change() has already been successfully called and + * CPU lists in cs haven't been updated yet. So defer it to later. + */ + if ((old_prs != new_prs) && (cmd != partcmd_update)) { + int err = update_partition_exclusive_flag(cs, new_prs); - if (!adding && !deleting) - return 0; + if (err) + return err; + } /* - * Change the parent's subparts_cpus. + * Change the parent's effective_cpus & effective_xcpus (top cpuset + * only). + * * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (adding) { - cpumask_or(parent->subparts_cpus, - parent->subparts_cpus, tmp->addmask); - cpumask_andnot(parent->effective_cpus, - parent->effective_cpus, tmp->addmask); - } - if (deleting) { - cpumask_andnot(parent->subparts_cpus, - parent->subparts_cpus, tmp->delmask); - /* - * Some of the CPUs in subparts_cpus might have been offlined. - */ - cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask); - cpumask_or(parent->effective_cpus, - parent->effective_cpus, tmp->delmask); - } + if (old_prs != new_prs) + cs->partition_root_state = new_prs; + + /* + * Adding to parent's effective_cpus means deletion CPUs from cs + * and vice versa. + */ + if (adding) + partition_xcpus_del(old_prs, parent, tmp->addmask); + if (deleting) + partition_xcpus_add(new_prs, parent, tmp->delmask); - parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); + + if ((old_prs != new_prs) && (cmd == partcmd_update)) + update_partition_exclusive_flag(cs, new_prs); + + if (adding || deleting) { + cpuset_update_tasks_cpumask(parent, tmp->addmask); + update_sibling_cpumasks(parent, cs, tmp); + } - return cmd == partcmd_update; + /* + * For partcmd_update without newmask, it is being called from + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. + */ + if ((cmd == partcmd_update) && !newmask) + update_partition_sd_lb(cs, old_prs); + + notify_partition_change(cs, old_prs); + return 0; +} + +/** + * compute_partition_effective_cpumask - compute effective_cpus for partition + * @cs: partition root cpuset + * @new_ecpus: previously computed effective_cpus to be updated + * + * Compute the effective_cpus of a partition root by scanning effective_xcpus + * of child partition roots and excluding their effective_xcpus. + * + * This has the side effect of invalidating valid child partition roots, + * if necessary. Since it is called from either cpuset_hotplug_update_tasks() + * or update_cpumasks_hier() where parent and children are modified + * successively, we don't need to call update_parent_effective_cpumask() + * and the child's effective_cpus will be updated in later iterations. + * + * Note that rcu_read_lock() is assumed to be held. + */ +static void compute_partition_effective_cpumask(struct cpuset *cs, + struct cpumask *new_ecpus) +{ + struct cgroup_subsys_state *css; + struct cpuset *child; + bool populated = partition_is_populated(cs, NULL); + + /* + * Check child partition roots to see if they should be + * invalidated when + * 1) child effective_xcpus not a subset of new + * excluisve_cpus + * 2) All the effective_cpus will be used up and cp + * has tasks + */ + compute_excpus(cs, new_ecpus); + cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); + + rcu_read_lock(); + cpuset_for_each_child(child, css, cs) { + if (!is_partition_valid(child)) + continue; + + /* + * There shouldn't be a remote partition underneath another + * partition root. + */ + WARN_ON_ONCE(is_remote_partition(child)); + child->prs_err = 0; + if (!cpumask_subset(child->effective_xcpus, + cs->effective_xcpus)) + child->prs_err = PERR_INVCPUS; + else if (populated && + cpumask_subset(new_ecpus, child->effective_xcpus)) + child->prs_err = PERR_NOCPUS; + + if (child->prs_err) { + int old_prs = child->partition_root_state; + + /* + * Invalidate child partition + */ + spin_lock_irq(&callback_lock); + make_partition_invalid(child); + spin_unlock_irq(&callback_lock); + notify_partition_change(child, old_prs); + continue; + } + cpumask_andnot(new_ecpus, new_ecpus, + child->effective_xcpus); + } + rcu_read_unlock(); } /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup + * @force: don't skip any descendant cpusets if set * - * When congifured cpumask is changed, the effective cpumasks of this cpuset + * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. * - * On legacy hierachy, effective_cpus will be the same with cpu_allowed. + * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * * Called with cpuset_mutex held */ -static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) +static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; - bool need_rebuild_sched_domains = false; + int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool remote = is_remote_partition(cp); + bool update_parent = false; - compute_effective_cpumask(tmp->new_cpus, cp, parent); + old_prs = new_prs = cp->partition_root_state; /* - * If it becomes empty, inherit the effective mask of the - * parent, which is guaranteed to have some CPUs. + * For child remote partition root (!= cs), we need to call + * remote_cpus_update() if effective_xcpus will be changed. + * Otherwise, we can skip the whole subtree. + * + * remote_cpus_update() will reuse tmp->new_cpus only after + * its value is being processed. */ - if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { - cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (!cp->use_parent_ecpus) { - cp->use_parent_ecpus = true; - parent->child_ecpus_count++; + if (remote && (cp != cs)) { + compute_excpus(cp, tmp->new_cpus); + if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } - } else if (cp->use_parent_ecpus) { - cp->use_parent_ecpus = false; - WARN_ON_ONCE(!parent->child_ecpus_count); - parent->child_ecpus_count--; + rcu_read_unlock(); + remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); + rcu_read_lock(); + + /* Remote partition may be invalidated */ + new_prs = cp->partition_root_state; + remote = (new_prs == old_prs); + } + + if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) + compute_partition_effective_cpumask(cp, tmp->new_cpus); + else + compute_effective_cpumask(tmp->new_cpus, cp, parent); + + if (remote) + goto get_css; /* Ready to update cpuset data */ + + /* + * A partition with no effective_cpus is allowed as long as + * there is no task associated with it. Call + * update_parent_effective_cpumask() to check it. + */ + if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { + update_parent = true; + goto update_parent_effective; } /* - * Skip the whole subtree if the cpumask remains the same - * and has no partition root state. + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some CPUs unless + * it is a partition root that has explicitly distributed + * out all its CPUs. + */ + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) + cpumask_copy(tmp->new_cpus, parent->effective_cpus); + + /* + * Skip the whole subtree if + * 1) the cpumask remains the same, + * 2) has no partition root state, + * 3) force flag not set, and + * 4) for v2 load balance state same as its parent. */ - if (!cp->partition_root_state && - cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { + if (!cp->partition_root_state && !force && + cpumask_equal(tmp->new_cpus, cp->effective_cpus) && + (!cpuset_v2() || + (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } +update_parent_effective: /* - * update_parent_subparts_cpumask() should have been called + * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call - * update_tasks_cpumask() again for tasks in the parent - * cpuset if the parent's subparts_cpus changes. + * cpuset_update_tasks_cpumask() again for tasks in the parent + * cpuset if the parent's effective_cpus changes. */ - if ((cp != cs) && cp->partition_root_state) { + if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { - case PRS_DISABLED: - /* - * If parent is not a partition root or an - * invalid partition root, clear the state - * state and the CS_CPU_EXCLUSIVE flag. - */ - WARN_ON_ONCE(cp->partition_root_state - != PRS_ERROR); - cp->partition_root_state = 0; - - /* - * clear_bit() is an atomic operation and - * readers aren't interested in the state - * of CS_CPU_EXCLUSIVE anyway. So we can - * just update the flag without holding - * the callback_lock. - */ - clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); - break; - - case PRS_ENABLED: - if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp)) - update_tasks_cpumask(parent); + case PRS_ROOT: + case PRS_ISOLATED: + update_parent = true; break; - case PRS_ERROR: + default: /* - * When parent is invalid, it has to be too. + * When parent is not a partition root or is + * invalid, child partition roots become + * invalid too. */ - cp->partition_root_state = PRS_ERROR; - if (cp->nr_subparts_cpus) { - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } + if (is_partition_valid(cp)) + new_prs = -cp->partition_root_state; + WRITE_ONCE(cp->prs_err, + is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART); break; } } - +get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); - spin_lock_irq(&callback_lock); - - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus && - (cp->partition_root_state != PRS_ENABLED)) { - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } else if (cp->nr_subparts_cpus) { + if (update_parent) { + update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); /* - * Make sure that effective_cpus & subparts_cpus - * are mutually exclusive. - * - * In the unlikely event that effective_cpus - * becomes empty. we clear cp->nr_subparts_cpus and - * let its child partition roots to compete for - * CPUs again. + * The cpuset partition_root_state may become + * invalid. Capture it. */ - cpumask_andnot(cp->effective_cpus, cp->effective_cpus, - cp->subparts_cpus); - if (cpumask_empty(cp->effective_cpus)) { - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - cpumask_clear(cp->subparts_cpus); - cp->nr_subparts_cpus = 0; - } else if (!cpumask_subset(cp->subparts_cpus, - tmp->new_cpus)) { - cpumask_andnot(cp->subparts_cpus, - cp->subparts_cpus, tmp->new_cpus); - cp->nr_subparts_cpus - = cpumask_weight(cp->subparts_cpus); - } + new_prs = cp->partition_root_state; } + + spin_lock_irq(&callback_lock); + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + cp->partition_root_state = new_prs; + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) + compute_excpus(cp, cp->effective_xcpus); + + /* + * Make sure effective_xcpus is properly set for a valid + * partition root. + */ + if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) + cpumask_and(cp->effective_xcpus, + cp->cpus_allowed, parent->effective_xcpus); + else if (new_prs < 0) + reset_partition_data(cp); spin_unlock_irq(&callback_lock); + notify_partition_change(cp, old_prs); + WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp); + cpuset_update_tasks_cpumask(cp, cp->effective_cpus); + + /* + * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE + * from parent if current cpuset isn't a valid partition root + * and their load balance states differ. + */ + if (cpuset_v2() && !is_partition_valid(cp) && + (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { + if (is_sched_load_balance(parent)) + set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + } /* * On legacy hierarchy, if the effective cpumask of any non- @@ -1390,17 +2369,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_root(cp))) - need_rebuild_sched_domains = true; + (!cpuset_v2() || is_partition_valid(cp))) + cpuset_force_rebuild(); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); - - if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); } /** @@ -1415,23 +2390,165 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + lockdep_assert_held(&cpuset_mutex); + /* * Check all its siblings and call update_cpumasks_hier() - * if their use_parent_ecpus flag is set in order for them - * to use the right effective_cpus value. + * if their effective_cpus will need to be changed. + * + * It is possible a change in parent's effective_cpus + * due to a change in a child partition's effective_xcpus will impact + * its siblings even if they do not inherit parent's effective_cpus + * directly. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus) + if (!is_partition_valid(sibling)) { + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) + continue; + } else if (is_remote_partition(sibling)) { + /* + * Change in a sibling cpuset won't affect a remote + * partition root. + */ continue; + } - update_cpumasks_hier(sibling, tmp); + if (!css_tryget_online(&sibling->css)) + continue; + + rcu_read_unlock(); + update_cpumasks_hier(sibling, tmp, false); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) +{ + int retval; + + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; + + return 0; +} + +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. + * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); + + if (cs_is_member(trialcs)) + return PERR_NONE; + + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; + + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; + + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; + + return PERR_NONE; +} + +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); + + retval = validate_change(cs, trialcs); + + if ((retval == -EINVAL) && cpuset_v2()) { + struct cgroup_subsys_state *css; + struct cpuset *cp; + + /* + * The -EINVAL error code indicates that partition sibling + * CPU exclusivity rule has been violated. We still allow + * the cpumask change to proceed while invalidating the + * partition. However, any conflicting sibling partitions + * have to be marked as invalid too. + */ + trialcs->prs_err = PERR_NOTEXCL; + rcu_read_lock(); + cpuset_for_each_child(cp, css, parent) { + struct cpumask *xcpus = user_xcpus(trialcs); + + if (is_partition_valid(cp) && + cpumask_intersects(xcpus, cp->effective_xcpus)) { + rcu_read_unlock(); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); + rcu_read_lock(); + } + } + rcu_read_unlock(); + retval = 0; + } + return retval; +} + +/** + * partition_cpus_change - Handle partition state changes due to CPU mask updates + * @cs: The target cpuset being modified + * @trialcs: The trial cpuset containing proposed configuration changes + * @tmp: Temporary masks for intermediate calculations + * + * This function handles partition state transitions triggered by CPU mask changes. + * CPU modifications may cause a partition to be disabled or require state updates. + */ +static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + enum prs_errcode prs_err; + + if (cs_is_member(cs)) + return; + + prs_err = validate_partition(cs, trialcs); + if (prs_err) + trialcs->prs_err = cs->prs_err = prs_err; + + if (is_remote_partition(cs)) { + if (trialcs->prs_err) + remote_partition_disable(cs, tmp); + else + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, tmp); + } else { + if (trialcs->prs_err) + update_parent_effective_cpumask(cs, partcmd_invalidate, + NULL, tmp); + else + update_parent_effective_cpumask(cs, partcmd_update, + trialcs->effective_xcpus, tmp); + } +} + /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider @@ -1443,81 +2560,120 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; - - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; - - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; - } + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - retval = validate_change(cs, trialcs); + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; + + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); if (retval < 0) - return retval; + goto out_free; -#ifdef CONFIG_CPUMASK_OFFSTACK /* - * Use the cpumasks in trialcs for tmpmasks when they are pointers - * to allocated cpumasks. + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. */ - tmp.addmask = trialcs->subparts_cpus; - tmp.delmask = trialcs->effective_cpus; - tmp.new_cpus = trialcs->cpus_allowed; -#endif + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); - if (cs->partition_root_state) { - /* Cpumask of a partition root cannot be empty */ - if (cpumask_empty(trialcs->cpus_allowed)) - return -EINVAL; - if (update_parent_subparts_cpumask(cs, partcmd_update, - trialcs->cpus_allowed, &tmp) < 0) - return -EINVAL; - } + partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + + /* effective_cpus/effective_xcpus will be updated here */ + update_cpumasks_hier(cs, &tmp, force); + + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) + update_partition_sd_lb(cs, old_prs); +out_free: + free_tmpmasks(&tmp); + return retval; +} + +/** + * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + * + * The tasks' cpumask will be updated if cs is a valid partition root. + */ +static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; + + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; + + /* Nothing to do if the CPUs didn't change */ + if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) + return 0; /* - * Make sure that subparts_cpus is a subset of cpus_allowed. + * Reject the change if there is exclusive CPUs conflict with + * the siblings. */ - if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); - cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); - } + if (compute_trialcs_excpus(trialcs, cs)) + return -EINVAL; + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + + retval = validate_change(cs, trialcs); + if (retval) + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + trialcs->prs_err = PERR_NONE; + partition_cpus_change(cs, trialcs, &tmp); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_cpumasks_hier(cs, &tmp); + /* + * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus + * of the subtree when it is a valid partition root or effective_xcpus + * is updated. + */ + if (is_partition_valid(cs) || force) + update_cpumasks_hier(cs, &tmp, force); - if (cs->partition_root_state) { - struct cpuset *parent = parent_cs(cs); + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) + update_partition_sd_lb(cs, old_prs); - /* - * For partition root, update the cpumasks of sibling - * cpusets if they use parent's effective_cpus. - */ - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); - } + free_tmpmasks(&tmp); return 0; } @@ -1552,6 +2708,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, { struct cpuset_migrate_mm_work *mwork; + if (nodes_equal(*from, *to)) { + mmput(mm); + return; + } + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); if (mwork) { mwork->mm = mm; @@ -1564,9 +2725,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -1600,14 +2776,14 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void *cpuset_being_rebound; /** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs) +void cpuset_update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; @@ -1618,7 +2794,7 @@ static void update_tasks_nodemask(struct cpuset *cs) guarantee_online_mems(cs, &newmems); /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold @@ -1666,7 +2842,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * When configured nodemask is changed, the effective nodemasks of this cpuset * and all its descendants need to be updated. * - * On legacy hiearchy, effective_mems will be the same with mems_allowed. + * On legacy hierarchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ @@ -1705,7 +2881,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); - update_tasks_nodemask(cp); + cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -1723,7 +2899,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, @@ -1732,41 +2908,26 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, int retval; /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + return retval; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) + return -EINVAL; + + /* No change? nothing to do */ + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) + return 0; - if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } retval = validate_change(cs, trialcs); if (retval < 0) - goto done; + return retval; + + check_insane_mems_config(&trialcs->mems_allowed); spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; @@ -1774,8 +2935,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); -done: - return retval; + return 0; } bool current_cpuset_is_being_rebound(void) @@ -1789,44 +2949,8 @@ bool current_cpuset_is_being_rebound(void) return ret; } -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays - * stable. - */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flag(cs, task); - css_task_iter_end(&it); -} - /* - * update_flag - read a 0 or a 1 in a file and update associated flag + * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared @@ -1834,7 +2958,7 @@ static void update_tasks_flags(struct cpuset *cs) * Call with cpuset_mutex held. */ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; @@ -1842,7 +2966,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int spread_flag_changed; int err; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; @@ -1865,242 +2989,242 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { + if (cpuset_v2()) + cpuset_force_rebuild(); + else + rebuild_sched_domains_locked(); + } if (spread_flag_changed) - update_tasks_flags(cs); + cpuset1_update_tasks_flags(cs); out: free_cpuset(trialcs); return err; } -/* - * update_prstate - update partititon_root_state - * cs: the cpuset to update - * val: 0 - disabled, 1 - enabled +/** + * update_prstate - update partition_root_state + * @cs: the cpuset to update + * @new_prs: new partition root state + * Return: 0 if successful, != 0 if error * * Call with cpuset_mutex held. */ -static int update_prstate(struct cpuset *cs, int val) +static int update_prstate(struct cpuset *cs, int new_prs) { - int err; + int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); - struct tmpmasks tmp; + struct tmpmasks tmpmask; + bool isolcpus_updated = false; - if ((val != 0) && (val != 1)) - return -EINVAL; - if (val == cs->partition_root_state) + if (old_prs == new_prs) return 0; /* - * Cannot force a partial or invalid partition root to a full - * partition root. + * Treat a previously invalid partition root as if it is a "member". */ - if (val && cs->partition_root_state) - return -EINVAL; + if (new_prs && is_partition_invalid(cs)) + old_prs = PRS_MEMBER; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; - err = -EINVAL; - if (!cs->partition_root_state) { + err = update_partition_exclusive_flag(cs, new_prs); + if (err) + goto out; + + if (!old_prs) { /* - * Turning on partition root requires setting the - * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed - * cannot be NULL. + * cpus_allowed and exclusive_cpus cannot be both empty. */ - if (cpumask_empty(cs->cpus_allowed)) - goto out; - - err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); - if (err) - goto out; - - err = update_parent_subparts_cpumask(cs, partcmd_enable, - NULL, &tmp); - if (err) { - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + if (xcpus_empty(cs)) { + err = PERR_CPUSEMPTY; goto out; } - cs->partition_root_state = PRS_ENABLED; - } else { + /* - * Turning off partition root will clear the - * CS_CPU_EXCLUSIVE bit. + * We don't support the creation of a new local partition with + * a remote partition underneath it. This unsupported + * setting can happen only if parent is the top_cpuset because + * a remote partition cannot be created underneath an existing + * local or remote partition. */ - if (cs->partition_root_state == PRS_ERROR) { - cs->partition_root_state = 0; - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - err = 0; + if ((parent == &top_cpuset) && + cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { + err = PERR_REMOTE; goto out; } - err = update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, &tmp); - if (err) - goto out; + /* + * If parent is valid partition, enable local partiion. + * Otherwise, enable a remote partition. + */ + if (is_partition_valid(parent)) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; - cs->partition_root_state = 0; + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); + } else { + err = remote_partition_enable(cs, new_prs, &tmpmask); + } + } else if (old_prs && new_prs) { + /* + * A change in load balance state only, no change in cpumasks. + * Need to update isolated_cpus. + */ + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(cs->effective_xcpus, NULL)) || + prstate_housekeeping_conflict(new_prs, cs->effective_xcpus)) + err = PERR_HKEEPING; + else + isolcpus_updated = true; + } else { + /* + * Switching back to member is always allowed even if it + * disables child partitions. + */ + if (is_remote_partition(cs)) + remote_partition_disable(cs, &tmpmask); + else + update_parent_effective_cpumask(cs, partcmd_disable, + NULL, &tmpmask); - /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + /* + * Invalidation of child partitions will be done in + * update_cpumasks_hier(). + */ } - +out: /* - * Update cpumask of parent's tasks except when it is the top - * cpuset as some system daemons cannot be mapped to other CPUs. + * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error + * happens. */ - if (parent != &top_cpuset) - update_tasks_cpumask(parent); + if (err) { + new_prs = -new_prs; + update_partition_exclusive_flag(cs, new_prs); + } - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + WRITE_ONCE(cs->prs_err, err); + if (!is_partition_valid(cs)) + reset_partition_data(cs); + else if (isolcpus_updated) + isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); + spin_unlock_irq(&callback_lock); + update_isolation_cpumasks(); - rebuild_sched_domains_locked(); -out: - free_cpumasks(NULL, &tmp); - return err; -} + /* Force update if switching back to member & update effective_xcpus */ + update_cpumasks_hier(cs, &tmpmask, !new_prs); -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; + /* A newly created partition must have effective_xcpus set */ + WARN_ON_ONCE(!old_prs && (new_prs > 0) + && cpumask_empty(cs->effective_xcpus)); - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; + /* Update sched domains and load balance flag */ + update_partition_sd_lb(cs, old_prs); - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; + notify_partition_change(cs, old_prs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); + free_tmpmasks(&tmpmask); + return 0; } -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) +static struct cpuset *cpuset_attach_old_cs; + +/* + * Check to see if a cpuset can accept a new task + * For v1, cpus_allowed and mems_allowed can't be empty. + * For v2, effective_cpus can't be empty. + * Note that in v1, effective_cpus = cpus_allowed. + */ +static int cpuset_can_attach_check(struct cpuset *cs) { - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); + if (cpumask_empty(cs->effective_cpus) || + (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) + return -ENOSPC; + return 0; } -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) +static void reset_migrate_dl_data(struct cpuset *cs) { - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; + cs->nr_migrate_dl_tasks = 0; + cs->sum_migrate_dl_bw = 0; } -static struct cpuset *cpuset_attach_old_cs; - /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; - struct cpuset *cs; + struct cpuset *cs, *oldcs; struct task_struct *task; + bool cpus_updated, mems_updated; int ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + oldcs = cpuset_attach_old_cs; cs = css_cs(css); mutex_lock(&cpuset_mutex); - /* allow moving tasks into an empty cpuset if on default hierarchy */ - ret = -ENOSPC; - if (!is_in_v2_mode() && - (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); + if (ret) goto out_unlock; + cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->cpus_allowed); + ret = task_can_attach(task); if (ret) goto out_unlock; - ret = security_task_setscheduler(task); - if (ret) + + /* + * Skip rights over task check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission()). + */ + if (!cpuset_v2() || (cpus_updated || mems_updated)) { + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + } + + if (dl_task(task)) { + cs->nr_migrate_dl_tasks++; + cs->sum_migrate_dl_bw += task->dl.dl_bw; + } + } + + if (!cs->nr_migrate_dl_tasks) + goto out_success; + + if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + + if (unlikely(cpu >= nr_cpu_ids)) { + reset_migrate_dl_data(cs); + ret = -EINVAL; + goto out_unlock; + } + + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) { + reset_migrate_dl_data(cs); goto out_unlock; + } } +out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; - ret = 0; out_unlock: mutex_unlock(&cpuset_mutex); return ret; @@ -2109,60 +3233,96 @@ out_unlock: static void cpuset_cancel_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; + struct cpuset *cs; cgroup_taskset_first(tset, &css); + cs = css_cs(css); mutex_lock(&cpuset_mutex); - css_cs(css)->attach_in_progress--; + dec_attach_in_progress_locked(cs); + + if (cs->nr_migrate_dl_tasks) { + int cpu = cpumask_any(cs->effective_cpus); + + dl_bw_free(cpu, cs->sum_migrate_dl_bw); + reset_migrate_dl_data(cs); + } + mutex_unlock(&cpuset_mutex); } /* - * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_to; + +static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) +{ + lockdep_assert_held(&cpuset_mutex); + + if (cs != &top_cpuset) + guarantee_active_cpus(task, cpus_attach); + else + cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), + subpartitions_cpus); + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset1_update_task_spread_flags(cs, task); +} static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_mutex */ - static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ mutex_lock(&cpuset_mutex); + cpus_updated = !cpumask_equal(cs->effective_cpus, + oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); + /* + * In the default hierarchy, enabling cpuset in the child cgroups + * will trigger a number of cpuset_attach() calls with no change + * in effective cpus and mems. In that case, we can optimize out + * by skipping the task iteration and update. + */ + if (cpuset_v2() && !cpus_updated && !mems_updated) { + cpuset_attach_nodemask_to = cs->effective_mems; + goto out; + } guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { - /* - * can_attach beforehand should guarantee that this doesn't - * fail. TODO: have a better way to handle failure here - */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); - - cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, task); - } + cgroup_taskset_for_each(task, css, tset) + cpuset_attach_task(cs, task); /* * Change mm for all threadgroup leaders. This is expensive and may - * sleep and should be moved outside migration path proper. + * sleep and should be moved outside migration path proper. Skip it + * if there is no change in effective_mems and CS_MEMORY_MIGRATE is + * not set. */ cpuset_attach_nodemask_to = cs->effective_mems; + if (!is_memory_migrate(cs) && !mems_updated) + goto out; + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); @@ -2177,155 +3337,51 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } +out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - - mutex_unlock(&cpuset_mutex); -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_SUBPARTS_CPULIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_PARTITION_ROOT, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; - - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; + if (cs->nr_migrate_dl_tasks) { + cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; + oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; + reset_migrate_dl_data(cs); } - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - return retval; -} - -static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; + dec_attach_in_progress_locked(cs); - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: mutex_unlock(&cpuset_mutex); - return retval; } /* * Common handling for a write to a "cpus" or "mems" file. */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; - buf = strstrip(buf); - - /* - * CPU or memory hotunplug may leave @cs w/o any execution - * resources, in which case the hotplug code asynchronously updates - * configuration and transfers all tasks to the nearest ancestor - * which can execute. - * - * As writes to "cpus" or "mems" may restore @cs's execution - * resources, wait for the previously scheduled operations before - * proceeding, so that we don't end up keep removing tasks added - * after execution capability is restored. - * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs - * operation like this one can lead to a deadlock through kernfs - * active_ref protection. Let's break the protection. Losing the - * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy - * hierarchies. - */ - css_get(&cs->css); - kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; - mutex_lock(&cpuset_mutex); + buf = strstrip(buf); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -2335,6 +3391,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; + case FILE_EXCLUSIVE_CPULIST: + retval = update_exclusive_cpumask(cs, trialcs, buf); + break; case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; @@ -2344,11 +3403,12 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); out_unlock: - mutex_unlock(&cpuset_mutex); - kernfs_unbreak_active_protection(of->kn); - css_put(&cs->css); - flush_workqueue(cpuset_migrate_mm_wq); + cpuset_full_unlock(); + if (of_cft(of)->private == FILE_MEMLIST) + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -2360,7 +3420,7 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) +int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; @@ -2381,8 +3441,17 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; + case FILE_EXCLUSIVE_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); + break; + case FILE_EFFECTIVE_XCPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); + break; case FILE_SUBPARTS_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); + break; + case FILE_ISOLATED_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); break; default: ret = -EINVAL; @@ -2392,71 +3461,38 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unrechable but makes gcc happy */ - return 0; -} - -static int sched_partition_show(struct seq_file *seq, void *v) +static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); + const char *err, *type = NULL; switch (cs->partition_root_state) { - case PRS_ENABLED: + case PRS_ROOT: seq_puts(seq, "root\n"); break; - case PRS_DISABLED: + case PRS_ISOLATED: + seq_puts(seq, "isolated\n"); + break; + case PRS_MEMBER: seq_puts(seq, "member\n"); break; - case PRS_ERROR: - seq_puts(seq, "root invalid\n"); + case PRS_INVALID_ROOT: + type = "root"; + fallthrough; + case PRS_INVALID_ISOLATED: + if (!type) + type = "isolated"; + err = perr_strings[READ_ONCE(cs->prs_err)]; + if (err) + seq_printf(seq, "%s invalid (%s)\n", type, err); + else + seq_printf(seq, "%s invalid\n", type); break; } return 0; } -static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, +static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -2465,39 +3501,34 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. - */ if (!strcmp(buf, "root")) - val = PRS_ENABLED; + val = PRS_ROOT; else if (!strcmp(buf, "member")) - val = PRS_DISABLED; + val = PRS_MEMBER; + else if (!strcmp(buf, "isolated")) + val = PRS_ISOLATED; else return -EINVAL; - css_get(&cs->css); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - retval = update_prstate(cs, val); -out_unlock: - mutex_unlock(&cpuset_mutex); - css_put(&cs->css); + cpuset_full_lock(); + if (is_cpuset_online(cs)) + retval = update_prstate(cs, val); + cpuset_full_unlock(); return retval ?: nbytes; } /* - * for the common functions, 'private' gives the type of file + * This is currently a minimal set for the default hierarchy. It can be + * expanded later on by migrating more features and control files from v1. */ - -static struct cftype legacy_files[] = { +static struct cftype dfl_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, }, { @@ -2506,152 +3537,73 @@ static struct cftype legacy_files[] = { .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, + .flags = CFTYPE_NOT_ON_ROOT, }, { - .name = "effective_cpus", + .name = "cpus.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_CPULIST, }, { - .name = "effective_mems", + .name = "mems.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, + .name = "cpus.partition", + .seq_show = cpuset_partition_show, + .write = cpuset_partition_write, + .private = FILE_PARTITION_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct cpuset, partition_file), }, - { } /* terminate */ -}; - -/* - * This is currently a minimal set for the default hierarchy. It can be - * expanded later on by migrating more features and control files from v1. - */ -static struct cftype dfl_files[] = { { - .name = "cpus", + .name = "cpus.exclusive", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, + .private = FILE_EXCLUSIVE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { - .name = "mems", + .name = "cpus.exclusive.effective", .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, + .private = FILE_EFFECTIVE_XCPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { - .name = "cpus.effective", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "mems.effective", + .name = "cpus.subpartitions", .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpus.partition", - .seq_show = sched_partition_show, - .write = sched_partition_write, - .private = FILE_PARTITION_ROOT, - .flags = CFTYPE_NOT_ON_ROOT, + .private = FILE_SUBPARTS_CPULIST, + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, { - .name = "cpus.subpartitions", + .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, - .private = FILE_SUBPARTS_CPULIST, - .flags = CFTYPE_DEBUG, + .private = FILE_ISOLATED_CPULIST, + .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ }; -/* - * cpuset_css_alloc - allocate a cpuset css - * cgrp: control group that the new cpuset will be part of +/** + * cpuset_css_alloc - Allocate a cpuset css + * @parent_css: Parent css of the control group that the new cpuset will be + * part of + * Return: cpuset css on success, -ENOMEM on failure. + * + * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return + * top cpuset css otherwise. */ - static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -2660,21 +3612,18 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - nodes_clear(cs->mems_allowed); - nodes_clear(cs->effective_mems); + __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + /* Set CS_MEMORY_MIGRATE for default hierarchy */ + if (cpuset_v2()) + __set_bit(CS_MEMORY_MIGRATE, &cs->flags); + return &cs->css; } @@ -2688,13 +3637,16 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - mutex_lock(&cpuset_mutex); - - set_bit(CS_ONLINE, &cs->flags); + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cpuset_v2() && !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -2702,8 +3654,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; } spin_unlock_irq(&callback_lock); @@ -2713,7 +3663,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for - * histrical reasons - the flag may be specified during mount. + * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to @@ -2739,7 +3689,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_full_unlock(); return 0; } @@ -2749,35 +3699,33 @@ out_unlock: * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. - * - * If the cpuset has the 'sched.partition' flag enabled, simulate - * turning 'sched.partition" off. */ - static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); - - if (is_partition_root(cs)) - update_prstate(cs, 0); - - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } + cpuset_full_lock(); + if (!cpuset_v2() && is_sched_load_balance(cs)) + cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); + cpuset_full_unlock(); +} - mutex_unlock(&cpuset_mutex); +/* + * If a dying cpuset has the 'cpus.partition' enabled, turn it off by + * changing it back to member to free its exclusive CPUs back to the pool to + * be used by other online cpusets. + */ +static void cpuset_css_killed(struct cgroup_subsys_state *css) +{ + struct cpuset *cs = css_cs(css); + + cpuset_full_lock(); + /* Reset valid partition back to member */ + if (is_partition_valid(cs)) + update_prstate(cs, PRS_MEMBER); + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2794,6 +3742,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); + cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { cpumask_copy(top_cpuset.cpus_allowed, @@ -2806,31 +3755,112 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } /* + * In case the child is cloned into a cpuset different from its parent, + * additional checks are done to see if the move is allowed. + */ +static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + int ret; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return 0; + + lockdep_assert_held(&cgroup_mutex); + mutex_lock(&cpuset_mutex); + + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); + if (ret) + goto out_unlock; + + ret = task_can_attach(task); + if (ret) + goto out_unlock; + + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; +} + +static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return; + + dec_attach_in_progress(cs); +} + +/* * Make sure the new task conform to the current state of its parent, * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ static void cpuset_fork(struct task_struct *task) { - if (task_css_is_root(task, cpuset_cgrp_id)) + struct cpuset *cs; + bool same_cs; + + rcu_read_lock(); + cs = task_cs(task); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) { + if (cs == &top_cpuset) + return; + + set_cpus_allowed_ptr(task, current->cpus_ptr); + task->mems_allowed = current->mems_allowed; return; + } - set_cpus_allowed_ptr(task, ¤t->cpus_allowed); - task->mems_allowed = current->mems_allowed; + /* CLONE_INTO_CGROUP */ + mutex_lock(&cpuset_mutex); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cpuset_attach_task(cs, task); + + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); } struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, + .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .can_fork = cpuset_can_fork, + .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, - .legacy_cftypes = legacy_files, +#ifdef CONFIG_CPUSETS_V1 + .legacy_cftypes = cpuset1_files, +#endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, @@ -2839,99 +3869,37 @@ struct cgroup_subsys cpuset_cgrp_subsys = { /** * cpuset_init - initialize cpusets at system boot * - * Description: Initialize top_cpuset and the cpuset internal file system, + * Description: Initialize top_cpuset **/ int __init cpuset_init(void) { - int err = 0; - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); + cpumask_setall(top_cpuset.effective_xcpus); + cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; - - err = register_filesystem(&cpuset_fs_type); - if (err < 0) - return err; BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - return 0; -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). - */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); + have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); + if (have_boot_isolcpus) { + BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); + cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); } -} - -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migratecd to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - mutex_unlock(&cpuset_mutex); - - /* - * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. - */ - if (is_empty) - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + return 0; } static void @@ -2939,7 +3907,8 @@ hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { - if (cpumask_empty(new_cpus)) + /* A partition root is allowed to have empty effective cpus */ + if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; @@ -2950,16 +3919,14 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs); + cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) - update_tasks_nodemask(cs); + cpuset_update_tasks_nodemask(cs); } -static bool force_rebuild; - void cpuset_force_rebuild(void) { - force_rebuild = true; + force_sd_rebuild = true; } /** @@ -2977,6 +3944,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) static nodemask_t new_mems; bool cpus_updated; bool mems_updated; + bool remote; + int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2992,75 +3961,77 @@ retry: goto retry; } - parent = parent_cs(cs); + parent = parent_cs(cs); compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); - if (cs->nr_subparts_cpus) - /* - * Make sure that CPUs allocated to child partitions - * do not show up in effective_cpus. - */ - cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); - if (!tmp || !cs->partition_root_state) goto update_tasks; /* - * In the unlikely event that a partition root has empty - * effective_cpus or its parent becomes erroneous, we have to - * transition it to the erroneous state. + * Compute effective_cpus for valid partition root, may invalidate + * child partition roots if necessary. */ - if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || - (parent->partition_root_state == PRS_ERROR))) { - if (cs->nr_subparts_cpus) { - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - compute_effective_cpumask(&new_cpus, cs, parent); - } - - /* - * If the effective_cpus is empty because the child - * partitions take away all the CPUs, we can keep - * the current partition and let the child partitions - * fight for available CPUs. - */ - if ((parent->partition_root_state == PRS_ERROR) || - cpumask_empty(&new_cpus)) { - update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, tmp); - cs->partition_root_state = PRS_ERROR; - } - cpuset_force_rebuild(); + remote = is_remote_partition(cs); + if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) + compute_partition_effective_cpumask(cs, &new_cpus); + + if (remote && cpumask_empty(&new_cpus) && + partition_is_populated(cs, NULL)) { + cs->prs_err = PERR_HOTPLUG; + remote_partition_disable(cs, tmp); + compute_effective_cpumask(&new_cpus, cs, parent); + remote = false; } /* - * On the other hand, an erroneous partition root may be transitioned - * back to a regular one or a partition root with no CPU allocated - * from the parent may change to erroneous. + * Force the partition to become invalid if either one of + * the following conditions hold: + * 1) empty effective cpus but not valid empty partition. + * 2) parent is invalid or doesn't grant any cpus to child + * partitions. */ - if (is_partition_root(parent) && - ((cs->partition_root_state == PRS_ERROR) || - !cpumask_intersects(&new_cpus, parent->subparts_cpus)) && - update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp)) - cpuset_force_rebuild(); + if (is_local_partition(cs) && (!is_partition_valid(parent) || + tasks_nocpu_error(parent, cs, &new_cpus))) + partcmd = partcmd_invalidate; + /* + * On the other hand, an invalid partition root may be transitioned + * back to a regular one with a non-empty effective xcpus. + */ + else if (is_partition_valid(parent) && is_partition_invalid(cs) && + !cpumask_empty(cs->effective_xcpus)) + partcmd = partcmd_update; + + if (partcmd >= 0) { + update_parent_effective_cpumask(cs, partcmd, NULL, tmp); + if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { + compute_partition_effective_cpumask(cs, &new_cpus); + cpuset_force_rebuild(); + } + } update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); + if (!cpus_updated && !mems_updated) + goto unlock; /* Hotplug doesn't affect this cpuset */ + + if (mems_updated) + check_insane_mems_config(&new_mems); if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else - hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); +unlock: mutex_unlock(&cpuset_mutex); } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -3074,8 +4045,10 @@ update_tasks: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -3083,9 +4056,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; + lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ @@ -3093,32 +4067,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work) new_mems = node_states[N_MEMORY]; /* - * If subparts_cpus is populated, it is likely that the check below - * will produce a false positive on cpus_updated when the cpu list - * isn't changed. It is extra work, but it is better to be safe. + * If subpartitions_cpus is populated, it is likely that the check + * below will produce a false positive on cpus_updated when the cpu + * list isn't changed. It is extra work, but it is better to be safe. */ - cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || + !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); - /* synchronize cpus_allowed to cpu_active_mask */ + /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { + cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); /* * Make sure that CPUs allocated to child partitions * do not show up in effective_cpus. If no CPU is left, - * we clear the subparts_cpus & let the child partitions + * we clear the subpartitions_cpus & let the child partitions * fight for the CPUs again. */ - if (top_cpuset.nr_subparts_cpus) { - if (cpumask_subset(&new_cpus, - top_cpuset.subparts_cpus)) { - top_cpuset.nr_subparts_cpus = 0; - cpumask_clear(top_cpuset.subparts_cpus); + if (!cpumask_empty(subpartitions_cpus)) { + if (cpumask_subset(&new_cpus, subpartitions_cpus)) { + cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, - top_cpuset.subparts_cpus); + subpartitions_cpus); } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); @@ -3133,7 +4107,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); - update_tasks_nodemask(&top_cpuset); + cpuset_update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -3157,13 +4131,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_unlock(); } - /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); - } + /* rebuild sched domains if necessary */ + if (force_sd_rebuild) + rebuild_sched_domains_cpuslocked(); - free_cpumasks(NULL, ptmp); + free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) @@ -3173,12 +4145,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); } /* @@ -3189,15 +4156,10 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; } -static struct notifier_block cpuset_track_online_nodes_nb = { - .notifier_call = cpuset_track_online_nodes, - .priority = 10, /* ??! */ -}; - /** * cpuset_init_smp - initialize cpus_allowed * @@ -3205,28 +4167,73 @@ static struct notifier_block cpuset_track_online_nodes_nb = { */ void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; + /* + * cpus_allowd/mems_allowed set to v2 values in the initial + * cpuset_bind() call will be reset to v1 values in another + * cpuset_bind() call when v1 cpuset is mounted. + */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); } +/* + * Return cpus_allowed mask from a task's cpuset. + */ +static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +{ + struct cpuset *cs; + + cs = task_cs(tsk); + if (cs != &top_cpuset) + guarantee_active_cpus(tsk, pmask); + /* + * Tasks in the top cpuset won't get update to their cpumasks + * when a hotplug online/offline event happens. So we include all + * offline cpus in the allowed cpu list. + */ + if ((cs == &top_cpuset) || cpumask_empty(pmask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + + /* + * We first exclude cpus allocated to partitions. If there is no + * allowable online cpu left, we fall back to all possible cpus. + */ + cpumask_andnot(pmask, possible_mask, subpartitions_cpus); + if (!cpumask_intersects(pmask, cpu_active_mask)) + cpumask_copy(pmask, possible_mask); + } +} + +/** + * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Similir to cpuset_cpus_allowed() except that the caller must have acquired + * cpuset_mutex. + */ +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +{ + lockdep_assert_held(&cpuset_mutex); + __cpuset_cpus_allowed_locked(tsk, pmask); +} + /** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. + * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_mask, even if this means going outside the - * tasks cpuset. + * subset of cpu_active_mask, even if this means going outside the + * tasks cpuset, except when the task is in the top cpuset. **/ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) @@ -3234,16 +4241,36 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); - guarantee_online_cpus(task_cs(tsk), pmask); - rcu_read_unlock(); + __cpuset_cpus_allowed_locked(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) +/** + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. + * @tsk: pointer to task_struct with which the scheduler is struggling + * + * Description: In the case that the scheduler cannot find an allowed cpu in + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy + * mode however, this value is the same as task_cs(tsk)->effective_cpus, + * which will not contain a sane cpumask during cases such as cpu hotplugging. + * This is the absolute last resort for the scheduler and it is only used if + * _every_ other avenue has been traveled. + * + * Returns true if the affinity of @tsk was changed, false otherwise. + **/ + +bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + const struct cpumask *cs_mask; + bool changed = false; + rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); + cs_mask = task_cs(tsk)->cpus_allowed; + if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { + set_cpus_allowed_force(tsk, cs_mask); + changed = true; + } rcu_read_unlock(); /* @@ -3263,6 +4290,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) * select_fallback_rq() will fix things ups and set cpu_possible_mask * if required. */ + return changed; } void __init cpuset_init_current_mems_allowed(void) @@ -3286,16 +4314,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; } /** - * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed * @nodemask: the nodemask to be checked * * Are any of the nodes in the nodemask allowed in current->mems_allowed? @@ -3318,8 +4344,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) return cs; } -/** - * cpuset_node_allowed - Can we allocate on a memory node? +/* + * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -3358,10 +4384,10 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -bool __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ + bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) @@ -3383,18 +4409,52 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; } +bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + bool allowed; + + /* + * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy + * and mems_allowed is likely to be empty even if we could get to it, + * so return true to avoid taking a global lock on the empty check. + */ + if (!cpuset_v2()) + return true; + + css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); + if (!css) + return true; + + /* + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but node_isset is atomic and the reference + * taken via cgroup_get_e_css is sufficient to protect css. + * + * Since this interface is intended for use by migration paths, we + * relax locking here to avoid taking global locks - while accepting + * there may be rare scenarios where the result may be innaccurate. + * + * Reclaim and migration are subject to these same race conditions, and + * cannot make strong isolation guarantees, so this is acceptable. + */ + cs = container_of(css, struct cpuset, css); + allowed = node_isset(nid, cs->effective_mems); + css_put(css); + return allowed; +} + /** - * cpuset_mem_spread_node() - On which node to begin search for a file page - * cpuset_slab_spread_node() - On which node to begin search for a slab page + * cpuset_spread_node() - On which node to begin search for a page + * @rotor: round robin rotor * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), @@ -3418,12 +4478,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) * is passed an offline node, it will fall back to the local node. * See kmem_cache_alloc_node(). */ - static int cpuset_spread_node(int *rotor) { return *rotor = next_node_in(*rotor, current->mems_allowed); } +/** + * cpuset_mem_spread_node() - On which node to begin search for a file page + */ int cpuset_mem_spread_node(void) { if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) @@ -3433,17 +4495,6 @@ int cpuset_mem_spread_node(void) return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} - -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - /** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. @@ -3482,79 +4533,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - **/ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); - if (retval >= PATH_MAX) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { |
