From 8f89140ae41ccd9c63344e6823faa862aa7435e3 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 16:24:10 -0700
Subject: cgroup: minor updates around cgroup_clear_directory()

* Rename it to cgroup_clear_dir() and make it take the pointer to the
  target cgroup instead of the dentry.  This makes the function
  consistent with its counterpart - cgroup_populate_dir().

* Move cgroup_clear_directory() invocation from cgroup_d_remove_dir()
  to cgroup_remount() so that the function doesn't have to determine
  the cgroup pointer back from the dentry.  cgroup_d_remove_dir() now
  only deals with vfs, which is slightly cleaner.

This patch doesn't introduce any functional differences.

Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e5583d10a325..09bfa870e698 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -957,15 +957,14 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }
 
 /**
- * cgroup_clear_directory - selective removal of base and subsystem files
- * @dir: directory containing the files
+ * cgroup_clear_dir - selective removal of base and subsystem files
+ * @cgrp: target cgroup
  * @base_files: true if the base files should be removed
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_directory(struct dentry *dir, bool base_files,
-				   unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files,
+			     unsigned long subsys_mask)
 {
-	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
 
 	for_each_root_subsys(cgrp->root, ss) {
@@ -987,9 +986,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
 	struct dentry *parent;
-	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-
-	cgroup_clear_directory(dentry, true, root->subsys_mask);
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1376,7 +1372,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	 * this before rebind_subsystems, since rebind_subsystems may
 	 * change this hierarchy's subsys_list.
 	 */
-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+	cgroup_clear_dir(cgrp, false, removed_mask);
 
 	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret) {
@@ -4541,9 +4537,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	raw_spin_unlock(&release_list_lock);
 
 	/*
-	 * Remove @cgrp directory.  The removal puts the base ref but we
-	 * aren't quite done with @cgrp yet, so hold onto it.
+	 * Clear and remove @cgrp directory.  The removal puts the base ref
+	 * but we aren't quite done with @cgrp yet, so hold onto it.
 	 */
+	cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask);
 	dget(d);
 	cgroup_d_remove_dir(d);
-- cgit

From b1f28d3109349899e87377e89f9d8ab5bc95ec57 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 16:24:10 -0700
Subject: cgroup: fix error path of cgroup_addrm_files()

cgroup_addrm_files() mishandled the error return value from
cgroup_add_file() and returns an error iff the last file fails to
create.  As we're in the process of cleaning up file add/rm error
handling and will reliably propagate file creation failures, there's
no point in continuing to add files after a failure.

Replace the broken error collection logic with immediate error
return.  While at it, add lockdep assertions and a function comment.
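
A minimal user-space sketch of the "immediate error return" style this
patch switches to - stop at the first failed creation and let the
caller clean up, instead of collecting the last error and pressing on.
The file names and the add_files() helper are hypothetical, for
illustration only, not the kernel code:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int add_files(const char *names[], int n)
{
	int i;

	for (i = 0; i < n; i++) {
		int fd = open(names[i], O_CREAT | O_WRONLY, 0644);

		if (fd < 0) {
			/* fail fast; the caller is responsible for cleanup */
			perror(names[i]);
			return -1;
		}
		close(fd);
	}
	return 0;	/* every file was created */
}

int main(void)
{
	const char *names[] = { "/tmp/a", "/tmp/b" };

	return add_files(names, 2) ? 1 : 0;
}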
Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 09bfa870e698..9b16d75bec63 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2780,11 +2780,26 @@ out:
 	return error;
 }
 
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @subsys: the subsystem of files to be added
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * All @cfts should belong to @subsys.  For removals, this function never
+ * fails.  If addition fails, this function doesn't remove files already
+ * added.  The caller is responsible for cleaning up.
+ */
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add)
 {
 	struct cftype *cft;
-	int err, ret = 0;
+	int ret;
+
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
@@ -2796,16 +2811,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			continue;
 
 		if (is_add) {
-			err = cgroup_add_file(cgrp, subsys, cft);
-			if (err)
+			ret = cgroup_add_file(cgrp, subsys, cft);
+			if (ret) {
 				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
-					cft->name, err);
-			ret = err;
+					cft->name, ret);
+				return ret;
+			}
 		} else {
 			cgroup_rm_file(cgrp, cft);
 		}
 	}
-	return ret;
+	return 0;
 }
 
 static void cgroup_cfts_prepare(void)
-- cgit

From 9ccece80ae19ed42439fc0ced76858f189cd41e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 16:24:11 -0700
Subject: cgroup: fix cgroup_add_cftypes() error handling

cgroup_add_cftypes() uses cgroup_cfts_commit() to actually create the
files; however, both functions ignore actual file creation errors and
just assume success.  This can lead to, for example, a blkio hierarchy
where some of the cgroups have only a subset of their interface files
populated after cfq-iosched is loaded under heavy memory pressure,
which is nasty.

This patch updates cgroup_cfts_commit() and cgroup_add_cftypes() to
guarantee that all files are created on success and no file is created
on failure.
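
The all-or-nothing guarantee can be pictured with a small stand-alone
C sketch (hypothetical helper names, not the kernel implementation):
on any failure, everything created so far is removed before the error
is returned.

#include <stdio.h>
#include <unistd.h>

static int create_one(const char *name)
{
	FILE *f = fopen(name, "w");

	if (!f)
		return -1;
	fclose(f);
	return 0;
}

/* either all files exist afterwards, or none of the new ones do */
static int create_all_or_nothing(const char *names[], int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (create_one(names[i]) < 0) {
			while (--i >= 0)
				unlink(names[i]);	/* roll back partial work */
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	const char *names[] = { "/tmp/x", "/tmp/y", "/tmp/z" };

	return create_all_or_nothing(names, 3) ? 1 : 0;
}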
Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9b16d75bec63..36c0ccc921f4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2836,8 +2836,8 @@ static void cgroup_cfts_prepare(void)
 	mutex_lock(&cgroup_mutex);
 }
 
-static void cgroup_cfts_commit(struct cgroup_subsys *ss,
-			       struct cftype *cfts, bool is_add)
+static int cgroup_cfts_commit(struct cgroup_subsys *ss,
+			      struct cftype *cfts, bool is_add)
 	__releases(&cgroup_mutex)
 {
 	LIST_HEAD(pending);
@@ -2846,12 +2846,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 	struct dentry *prev = NULL;
 	struct inode *inode;
 	u64 update_before;
+	int ret = 0;
 
 	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
 	if (!cfts || ss->root == &cgroup_dummy_root ||
 	    !atomic_inc_not_zero(&sb->s_active)) {
 		mutex_unlock(&cgroup_mutex);
-		return;
+		return 0;
 	}
 
 	/*
@@ -2867,10 +2868,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 	inode = root->dentry->d_inode;
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
-	cgroup_addrm_files(root, ss, cfts, is_add);
+	ret = cgroup_addrm_files(root, ss, cfts, is_add);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&inode->i_mutex);
 
+	if (ret)
+		goto out_deact;
+
 	/* add/rm files for all cgroups created before */
 	rcu_read_lock();
 	cgroup_for_each_descendant_pre(cgrp, root) {
@@ -2887,15 +2891,19 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
-			cgroup_addrm_files(cgrp, ss, cfts, is_add);
+			ret = cgroup_addrm_files(cgrp, ss, cfts, is_add);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 
 		rcu_read_lock();
+		if (ret)
+			break;
 	}
 	rcu_read_unlock();
 	dput(prev);
+out_deact:
 	deactivate_super(sb);
+	return ret;
 }
 
 /**
@@ -2915,6 +2923,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype_set *set;
+	int ret;
 
 	set = kzalloc(sizeof(*set), GFP_KERNEL);
 	if (!set)
@@ -2923,9 +2932,10 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 	cgroup_cfts_prepare();
 	set->cfts = cfts;
 	list_add_tail(&set->node, &ss->cftsets);
-	cgroup_cfts_commit(ss, cfts, true);
-
-	return 0;
+	ret = cgroup_cfts_commit(ss, cfts, true);
+	if (ret)
+		cgroup_rm_cftypes(ss, cfts);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
-- cgit

From 628f7cd47ab758cae0353d1a6decf3d1459dca24 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 16:24:11 -0700
Subject: cgroup: separate out cgroup_base_files[] handling out of cgroup_populate/clear_dir()

cgroup_populate/clear_dir() currently take @base_files and add and
remove, respectively, cgroup_base_files[] to the directory.  File
additions and removals are being reorganized for proper error handling
and more dynamic handling for the unified hierarchy, and mixing base
and subsys file handling in the same functions gets a bit confusing.

This patch moves base file handling out of cgroup_populate/clear_dir()
into their users - cgroup_mount(), cgroup_create() and
cgroup_destroy_locked().

Note that this changes the behavior of base file removal.  If
@base_files is %true, cgroup_clear_dir() used to delete files
regardless of cftype until there are no files left.  Now, only files
with matching cfts are removed.  As files can only be created by the
base or registered cftypes, this shouldn't result in any behavior
difference.

Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 46 +++++++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 36c0ccc921f4..9835a097f3c0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -215,6 +215,8 @@ static u64 cgroup_serial_nr_next = 1;
  */
 static int need_forkexit_callback __read_mostly;
 
+static struct cftype cgroup_base_files[];
+
 static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
@@ -804,8 +806,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-			       unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -957,13 +958,11 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }
 
 /**
- * cgroup_clear_dir - selective removal of base and subsystem files
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
  * @cgrp: target cgroup
- * @base_files: true if the base files should be removed
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files,
-			     unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
 
@@ -974,10 +973,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files,
 		list_for_each_entry(set, &ss->cftsets, node)
 			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
 	}
-	if (base_files) {
-		while (!list_empty(&cgrp->files))
-			cgroup_rm_file(cgrp, NULL);
-	}
 }
 
 /*
@@ -1372,17 +1367,17 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	 * this before rebind_subsystems, since rebind_subsystems may
 	 * change this hierarchy's subsys_list.
 	 */
-	cgroup_clear_dir(cgrp, false, removed_mask);
+	cgroup_clear_dir(cgrp, removed_mask);
 
 	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret) {
 		/* rebind_subsystems failed, re-populate the removed files */
-		cgroup_populate_dir(cgrp, false, removed_mask);
+		cgroup_populate_dir(cgrp, removed_mask);
 		goto out_unlock;
 	}
 
 	/* re-populate subsystem files */
-	cgroup_populate_dir(cgrp, false, added_mask);
+	cgroup_populate_dir(cgrp, added_mask);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -1687,7 +1682,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(root->number_of_cgroups != 1);
 
 		cred = override_creds(&init_cred);
-		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
+		cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true);
+		cgroup_populate_dir(root_cgrp, root->subsys_mask);
 		revert_creds(cred);
 		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
@@ -4172,23 +4168,14 @@ static struct cftype cgroup_base_files[] = {
 };
 
 /**
- * cgroup_populate_dir - selectively creation of files in a directory
+ * cgroup_populate_dir - create subsys files in a cgroup directory
  * @cgrp: target cgroup
- * @base_files: true if the base files should be added
 * @subsys_mask: mask of the subsystem ids whose files should be added
 */
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-			       unsigned long subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
-	int err;
 	struct cgroup_subsys *ss;
 
-	if (base_files) {
-		err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
-		if (err < 0)
-			return err;
-	}
-
 	/* process cftsets of each subsystem */
 	for_each_root_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
@@ -4410,7 +4397,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
-	err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
+	err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
+	if (err)
+		goto err_destroy;
+
+	err = cgroup_populate_dir(cgrp, root->subsys_mask);
 	if (err)
 		goto err_destroy;
 
@@ -4566,7 +4557,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * Clear and remove @cgrp directory.  The removal puts the base ref
 	 * but we aren't quite done with @cgrp yet, so hold onto it.
 	 */
-	cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask);
+	cgroup_clear_dir(cgrp, cgrp->root->subsys_mask);
+	cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false);
 
 	dget(d);
 	cgroup_d_remove_dir(d);
-- cgit

From bee550994f6b0c1179bd3ccea58dc5c2c4ccf842 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 16:24:11 -0700
Subject: cgroup: update error handling in cgroup_populate_dir()

cgroup_populate_dir() didn't check whether the actual file creations
were successful and could return success with only a subset of the
requested files created, which is nasty.

This patch updates cgroup_populate_dir() so that it either succeeds
with all files or fails with no file.

v2: The original patch also converted for_each_root_subsys() usages
    to for_each_subsys() without explaining why.  That part has been
    moved to a separate patch.
Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9835a097f3c0..6b7324431b99 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4171,10 +4171,13 @@ static struct cftype cgroup_base_files[] = {
  * cgroup_populate_dir - create subsys files in a cgroup directory
  * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be added
+ *
+ * On failure, no file is added.
  */
 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
+	int ret = 0;
 
 	/* process cftsets of each subsystem */
 	for_each_root_subsys(cgrp->root, ss) {
@@ -4182,8 +4185,11 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
 
-		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_addrm_files(cgrp, ss, set->cfts, true);
+		list_for_each_entry(set, &ss->cftsets, node) {
+			ret = cgroup_addrm_files(cgrp, ss, set->cfts, true);
+			if (ret < 0)
+				goto err;
+		}
 	}
 
 	/* This cgroup is ready now */
@@ -4201,6 +4207,9 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 	}
 
 	return 0;
+err:
+	cgroup_clear_dir(cgrp, subsys_mask);
+	return ret;
 }
 
 static void css_dput_fn(struct work_struct *work)
-- cgit

From b420ba7db15659253d4f286a0ba479d336371999 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 12 Jul 2013 12:34:02 -0700
Subject: cgroup: use for_each_subsys() instead of for_each_root_subsys() in cgroup_populate/clear_dir()

rebind_subsystems() will be updated to handle file creations and
removals with proper error handling and, to do that, it will need to
perform file operations before actually adding the subsystem to the
hierarchy.

To enable such usage, update cgroup_populate/clear_dir() to use
for_each_subsys() instead of for_each_root_subsys() so that they
operate on all subsystems specified by @subsys_mask whether that
subsystem is currently bound to the hierarchy or not.
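
The difference between the two iterators comes down to visiting every
controller whose bit is set in a mask versus walking a list of
controllers already bound to the hierarchy.  A toy sketch of the
mask-driven form (the controller table is invented for illustration):

#include <stdio.h>

#define NSUBSYS 3

static const char * const subsys_name[NSUBSYS] = { "cpu", "memory", "blkio" };

/* visit every subsystem whose bit is set, bound to a hierarchy or not */
static void clear_dir(unsigned long subsys_mask)
{
	int i;

	for (i = 0; i < NSUBSYS; i++) {
		if (!(subsys_mask & (1UL << i)))
			continue;
		printf("removing files of %s\n", subsys_name[i]);
	}
}

int main(void)
{
	clear_dir((1UL << 0) | (1UL << 2));	/* cpu and blkio only */
	return 0;
}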
Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6b7324431b99..8f70dc0c0c79 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -965,10 +965,12 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
+	int i;
 
-	for_each_root_subsys(cgrp->root, ss) {
+	for_each_subsys(ss, i) {
 		struct cftype_set *set;
-		if (!test_bit(ss->subsys_id, &subsys_mask))
+
+		if (!test_bit(i, &subsys_mask))
 			continue;
 		list_for_each_entry(set, &ss->cftsets, node)
 			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
@@ -4177,12 +4179,13 @@ static struct cftype cgroup_base_files[] = {
 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
-	int ret = 0;
+	int i, ret = 0;
 
 	/* process cftsets of each subsystem */
-	for_each_root_subsys(cgrp->root, ss) {
+	for_each_subsys(ss, i) {
 		struct cftype_set *set;
-		if (!test_bit(ss->subsys_id, &subsys_mask))
+
+		if (!test_bit(i, &subsys_mask))
 			continue;
 
 		list_for_each_entry(set, &ss->cftsets, node) {
-- cgit

From 3126121fb30941552b1a806c7c2e686bde57e270 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 17:07:30 -0700
Subject: cgroup: make rebind_subsystems() handle file additions and removals with proper error handling

Currently, creating and removing cgroup files in the root directory
are handled separately from the actual subsystem binding and unbinding
which happens in rebind_subsystems().  Also, rebind_subsystems() users
aren't handling file creation errors properly.  Let's integrate
top_cgroup file handling into rebind_subsystems() so that it's simpler
to use and everyone handles file creation errors correctly.

* On a successful return, rebind_subsystems() is guaranteed to have
  created all files of the new subsystems and deleted the ones
  belonging to the removed subsystems.  After a failure, no file is
  created or removed.

* cgroup_remount() no longer needs to make explicit populate/clear
  calls as it's all handled by rebind_subsystems(), and it gets proper
  error handling automatically.

* cgroup_mount() has been updated such that the root dentry and cgroup
  are linked before rebind_subsystems().  Also, the init_cred dancing
  and base file handling are moved right above the rebind_subsystems()
  call and proper error handling for the base files is added.  While
  at it, add a comment explaining what's going on with the cred thing.

* cgroup_kill_sb() calls rebind_subsystems() to unbind all subsystems,
  which now implies removing all subsystem files, which requires the
  directory's i_mutex.  Grab it.  This means that files on the root
  cgroup are removed earlier - they used to be deleted from generic
  super_block cleanup from vfs.  This doesn't lead to any functional
  difference and it's cleaner to do the clean up explicitly for all
  files.

Combined with the previous changes, this makes all cgroup file
creation errors handled correctly.

v2: Added comment on init_cred.

v3: Li spotted that cgroup_mount() wasn't freeing tmp_links after base
    file addition failure.  Fix it by adding free_tmp_links error
    handling label.

v4: v3 introduced build bugs which got noticed by Fengguang's awesome
    kbuild test robot.  Fixed, and shame on me.
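
The structural idea - do everything that can fail before anything that
cannot be undone - can be sketched in a few lines of plain C.  The
helper names below are made up for illustration; they are stand-ins,
not the kernel functions:

#include <stdio.h>

/* stand-ins for the real work; populate() is the only step that can fail */
static int populate(unsigned long mask)
{
	printf("create files for mask %#lx\n", mask);
	return 0;
}

static void clear(unsigned long mask)
{
	printf("remove files for mask %#lx\n", mask);
}

static void commit_rebind(unsigned long added, unsigned long removed)
{
	printf("rebind +%#lx -%#lx\n", added, removed);
}

static int rebind(unsigned long added, unsigned long removed)
{
	int ret;

	/* phase 1: fallible work first - nothing to undo if it fails */
	ret = populate(added);
	if (ret)
		return ret;

	/* phase 2: nothing can fail from this point on */
	clear(removed);
	commit_rebind(added, removed);
	return 0;
}

int main(void)
{
	return rebind(0x3, 0x4);
}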
Signed-off-by: Tejun Heo
Acked-by: Li Zefan
Cc: Fengguang Wu
---
 kernel/cgroup.c | 73 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8f70dc0c0c79..4ec8d2da94d1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1003,7 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 {
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_subsys *ss;
-	int i;
+	int i, ret;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
@@ -1028,7 +1028,16 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	if (root->number_of_cgroups > 1)
 		return -EBUSY;
 
-	/* Process each subsystem */
+	ret = cgroup_populate_dir(cgrp, added_mask);
+	if (ret)
+		return ret;
+
+	/*
+	 * Nothing can fail from this point on.  Remove files for the
+	 * removed subsystems and rebind each subsystem.
+	 */
+	cgroup_clear_dir(cgrp, removed_mask);
+
 	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
 
@@ -1364,22 +1373,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/*
-	 * Clear out the files of subsystems that should be removed, do
-	 * this before rebind_subsystems, since rebind_subsystems may
-	 * change this hierarchy's subsys_list.
-	 */
-	cgroup_clear_dir(cgrp, removed_mask);
-
 	ret = rebind_subsystems(root, added_mask, removed_mask);
-	if (ret) {
-		/* rebind_subsystems failed, re-populate the removed files */
-		cgroup_populate_dir(cgrp, removed_mask);
+	if (ret)
 		goto out_unlock;
-	}
-
-	/* re-populate subsystem files */
-	cgroup_populate_dir(cgrp, added_mask);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -1578,7 +1574,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
+	struct list_head tmp_links;
 	struct inode *inode;
+	const struct cred *cred;
 
 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
@@ -1610,10 +1608,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	BUG_ON(!root);
 	if (root == opts.new_root) {
 		/* We used the new root structure, so this is a new hierarchy */
-		struct list_head tmp_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct cgroupfs_root *existing_root;
-		const struct cred *cred;
 		int i;
 		struct css_set *cset;
 
@@ -1651,26 +1647,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		if (ret)
 			goto unlock_drop;
 
+		sb->s_root->d_fsdata = root_cgrp;
+		root_cgrp->dentry = sb->s_root;
+
+		/*
+		 * We're inside get_sb() and will call lookup_one_len() to
+		 * create the root files, which doesn't work if SELinux is
+		 * in use.  The following cred dancing somehow works around
+		 * it.  See 2ce9738ba ("cgroupfs: use init_cred when
+		 * populating new cgroupfs mount") for more details.
+		 */
+		cred = override_creds(&init_cred);
+
+		ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true);
+		if (ret)
+			goto rm_base_files;
+
 		ret = rebind_subsystems(root, root->subsys_mask, 0);
-		if (ret == -EBUSY) {
-			free_cgrp_cset_links(&tmp_links);
-			goto unlock_drop;
-		}
+		if (ret)
+			goto rm_base_files;
+
+		revert_creds(cred);
+
 		/*
 		 * There must be no failure case after here, since rebinding
 		 * takes care of subsystems' refcounts, which are explicitly
 		 * dropped in the failure exit path.
 		 */
-		/* EBUSY should be the only error here */
-		BUG_ON(ret);
-
 		list_add(&root->root_list, &cgroup_roots);
 		cgroup_root_count++;
 
-		sb->s_root->d_fsdata = root_cgrp;
-		root->top_cgroup.dentry = sb->s_root;
-
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
@@ -1683,10 +1690,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
-		cred = override_creds(&init_cred);
-		cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true);
-		cgroup_populate_dir(root_cgrp, root->subsys_mask);
-		revert_creds(cred);
 		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
@@ -1715,6 +1718,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	kfree(opts.name);
 	return dget(sb->s_root);
 
+ rm_base_files:
+	free_cgrp_cset_links(&tmp_links);
+	cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false);
+	revert_creds(cred);
  unlock_drop:
 	cgroup_exit_root_id(root);
 	mutex_unlock(&cgroup_root_mutex);
@@ -1741,6 +1748,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
 
+	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
@@ -1773,6 +1781,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	simple_xattrs_free(&cgrp->xattrs);
-- cgit

From f172e67cf9d842bc646d0f66792e38435a334b1e Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 17:07:30 -0700
Subject: cgroup: move number_of_cgroups test out of rebind_subsystems() into cgroup_remount()

rebind_subsystems() currently fails if the hierarchy has any !root
cgroups; however, on the planned unified hierarchy,
rebind_subsystems() will be used while populated.  Move the test to
cgroup_remount(), which is the only place the test is necessary
anyway.

As it's impossible for the other two callers of rebind_subsystems() to
have a populated hierarchy, this doesn't make any behavior changes.

Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4ec8d2da94d1..c108d3d1ea30 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1021,13 +1021,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		}
 	}
 
-	/* Currently we don't handle adding/removing subsystems when
-	 * any child cgroups exist. This is theoretically supportable
-	 * but involves complex error handling, so it's being left until
-	 * later */
-	if (root->number_of_cgroups > 1)
-		return -EBUSY;
-
 	ret = cgroup_populate_dir(cgrp, added_mask);
 	if (ret)
 		return ret;
@@ -1373,6 +1366,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
+	/* remounting is not allowed for populated hierarchies */
+	if (root->number_of_cgroups > 1) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret)
 		goto out_unlock;
-- cgit

From 1d5be6b287c8efc879fbe578e2b7bc8f7a38f313 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 12 Jul 2013 13:38:17 -0700
Subject: cgroup: move module ref handling into rebind_subsystems()

Module ref handling in cgroup is rather weird.
parse_cgroupfs_options() grabs all the modules for the specified
subsystems.  A module ref is kept if the specified subsystem is newly
bound to the hierarchy.  If not, or if the operation fails, the refs
are dropped.  This scatters module ref handling across multiple
functions, making it difficult to track.  It also makes the function
nasty to use for dynamic subsystem binding which is necessary for the
planned unified hierarchy.

There's nothing which requires the subsystem modules to be pinned
between parse_cgroupfs_options() and rebind_subsystems() in both mount
and remount paths.  parse_cgroupfs_options() can just parse and
rebind_subsystems() can handle pinning the subsystems that it wants to
bind, which is a natural part of its task - binding - anyway.

Move module ref handling into rebind_subsystems() which makes the code
a lot simpler - modules are gotten iff it's gonna be bound and put iff
unbound or binding fails.

v2: Li pointed out that if a controller module is unloaded between
    parsing and binding, rebind_subsystems() won't notice the missing
    controller as it only iterates through existing controllers.  Fix
    it by updating rebind_subsystems() to compare @added_mask to
    @pinned and fail with -ENOENT if they don't match.

Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 93 +++++++++++++++++----------------------------------------
 1 file changed, 28 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c108d3d1ea30..2a8cf1a7d2f4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1003,6 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 {
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_subsys *ss;
+	unsigned long pinned = 0;
 	int i, ret;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -1010,20 +1011,32 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 
 	/* Check that any added subsystems are currently free */
 	for_each_subsys(ss, i) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & added_mask))
+		if (!(added_mask & (1 << i)))
 			continue;
 
+		/* is the subsystem mounted elsewhere? */
 		if (ss->root != &cgroup_dummy_root) {
-			/* Subsystem isn't free */
-			return -EBUSY;
+			ret = -EBUSY;
+			goto out_put;
+		}
+
+		/* pin the module */
+		if (!try_module_get(ss->module)) {
+			ret = -ENOENT;
+			goto out_put;
 		}
+		pinned |= 1 << i;
+	}
+
+	/* subsys could be missing if unloaded between parsing and here */
+	if (added_mask != pinned) {
+		ret = -ENOENT;
+		goto out_put;
 	}
 
 	ret = cgroup_populate_dir(cgrp, added_mask);
 	if (ret)
-		return ret;
+		goto out_put;
 
 	/*
 	 * Nothing can fail from this point on.  Remove files for the
@@ -1067,11 +1080,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		} else if (bit & root->subsys_mask) {
 			/* Subsystem state should already exist */
 			BUG_ON(!cgrp->subsys[i]);
-			/*
-			 * a refcount was taken, but we already had one, so
-			 * drop the extra reference.
-			 */
-			module_put(ss->module);
 #ifdef CONFIG_MODULE_UNLOAD
 			BUG_ON(ss->module && !module_refcount(ss->module));
 #endif
@@ -1088,6 +1096,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
+
+out_put:
+	for_each_subsys(ss, i)
+		if (pinned & (1 << i))
+			module_put(ss->module);
+	return ret;
 }
 
 static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1138,7 +1152,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	bool module_pin_failed = false;
 	struct cgroup_subsys *ss;
 	int i;
 
@@ -1281,52 +1294,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	if (!opts->subsys_mask && !opts->name)
 		return -EINVAL;
 
-	/*
-	 * Grab references on all the modules we'll need, so the subsystems
-	 * don't dance around before rebind_subsystems attaches them. This may
-	 * take duplicate reference counts on a subsystem that's already used,
-	 * but rebind_subsystems handles this case.
-	 */
-	for_each_subsys(ss, i) {
-		if (!(opts->subsys_mask & (1UL << i)))
-			continue;
-		if (!try_module_get(cgroup_subsys[i]->module)) {
-			module_pin_failed = true;
-			break;
-		}
-	}
-	if (module_pin_failed) {
-		/*
-		 * oops, one of the modules was going away. this means that we
-		 * raced with a module_delete call, and to the user this is
-		 * essentially a "subsystem doesn't exist" case.
-		 */
-		for (i--; i >= 0; i--) {
-			/* drop refcounts only on the ones we took */
-			unsigned long bit = 1UL << i;
-
-			if (!(bit & opts->subsys_mask))
-				continue;
-			module_put(cgroup_subsys[i]->module);
-		}
-		return -ENOENT;
-	}
-
 	return 0;
 }
 
-static void drop_parsed_module_refcounts(unsigned long subsys_mask)
-{
-	struct cgroup_subsys *ss;
-	int i;
-
-	mutex_lock(&cgroup_mutex);
-	for_each_subsys(ss, i)
-		if (subsys_mask & (1UL << i))
-			module_put(cgroup_subsys[i]->module);
-	mutex_unlock(&cgroup_mutex);
-}
-
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
@@ -1384,8 +1354,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
-	if (ret)
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }
 
@@ -1591,7 +1559,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto drop_modules;
+		goto out_err;
 	}
 	opts.new_root = new_root;
 
@@ -1600,7 +1568,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_free_root(opts.new_root);
-		goto drop_modules;
+		goto out_err;
 	}
 
 	root = sb->s_fs_info;
@@ -1708,9 +1676,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
 		}
 	}
-
-		/* no subsys rebinding, so refcounts don't change */
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
 
 	kfree(opts.release_agent);
@@ -1728,8 +1693,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_unlock(&inode->i_mutex);
  drop_new_super:
 	deactivate_locked_super(sb);
- drop_modules:
-	drop_parsed_module_refcounts(opts.subsys_mask);
 out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -4837,7 +4800,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 
 	/*
 	 * we shouldn't be called if the subsystem is in use, and the use of
-	 * try_module_get in parse_cgroupfs_options should ensure that it
+	 * try_module_get() in rebind_subsystems() should ensure that it
 	 * doesn't start being used while we're killing it off.
 	 */
 	BUG_ON(ss->root != &cgroup_dummy_root);
-- cgit

From a698b4488ab98deef6c3beeba3e27fea17650132 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 28 Jun 2013 21:08:27 -0700
Subject: cgroup: remove gratuitous BUG_ON()s from rebind_subsystems()

rebind_subsystems() performs sanity checks even on subsystems which
aren't specified to be added or removed, and the checks aren't all
that useful given that these are in a very cold path while the
violations they check for would trip up much hotter paths.

Let's remove these from rebind_subsystems().

Signed-off-by: Tejun Heo
Acked-by: Li Zefan
---
 kernel/cgroup.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2a8cf1a7d2f4..345fac8e4fba 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1077,15 +1077,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
 			root->subsys_mask &= ~bit;
-		} else if (bit & root->subsys_mask) {
-			/* Subsystem state should already exist */
-			BUG_ON(!cgrp->subsys[i]);
-#ifdef CONFIG_MODULE_UNLOAD
-			BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
-		} else {
-			/* Subsystem state shouldn't exist */
-			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-- cgit

From 9ad9d25a1ec82d6e52d687348e8cdd4942e7d393 Mon Sep 17 00:00:00 2001
From: Zhao Hongjiang
Date: Sat, 27 Jul 2013 11:56:49 +0800
Subject: cpuset: get rid of the useless forward declaration of cpuset

Get rid of the useless forward declaration of struct cpuset, since the
structure is defined just below it.

Signed-off-by: Zhao Hongjiang
Acked-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cpuset.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e5657788fedd..2ddd9b93feaa 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -70,7 +70,6 @@ int number_of_cpusets __read_mostly;
 
 /* Forward declare cgroup structures */
 struct cgroup_subsys cpuset_subsys;
-struct cpuset;
 
 /* See "Frequency meter" comments, below. */
-- cgit

From 0b9e6965add0701e5cbf56d5bab6d9181e948359 Mon Sep 17 00:00:00 2001
From: Zhao Hongjiang
Date: Sat, 27 Jul 2013 11:56:53 +0800
Subject: cpuset: relocate a misplaced comment

The comment for cpuset_css_offline() was on top of cpuset_css_free().
Move it.

Signed-off-by: Zhao Hongjiang
Acked-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cpuset.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2ddd9b93feaa..703bfd5a32a9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2020,6 +2020,12 @@ out_unlock:
 	return 0;
 }
 
+/*
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call rebuild_sched_domains_locked().
+ */
+
 static void cpuset_css_offline(struct cgroup *cgrp)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
@@ -2035,12 +2041,6 @@ static void cpuset_css_offline(struct cgroup *cgrp)
 	mutex_unlock(&cpuset_mutex);
 }
 
-/*
- * If the cpuset being removed has its flag 'sched_load_balance'
- * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
- */
-
 static void cpuset_css_free(struct cgroup *cgrp)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
-- cgit

From 2a4ac63333584b2791986cf2270f5ba9a4b97606 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Wed, 31 Jul 2013 16:16:40 +0800
Subject: cgroup: remove sparse tags from offline_css()

This should have been removed in commit d7eeac1913ff ("cgroup: hold
cgroup_mutex before calling css_offline").  While at it, update the
comments.

Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 345fac8e4fba..41b559f51502 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4214,7 +4214,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	INIT_WORK(&css->dput_work, css_dput_fn);
 }
 
-/* invoke ->post_create() on a new CSS and mark it online if successful */
+/* invoke ->css_online() on a new CSS and mark it online if successful */
 static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	int ret = 0;
@@ -4228,9 +4228,8 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	return ret;
 }
 
-/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
 static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
-	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-- cgit

From e0798ce27346edb8aa369b5b39af5a47fdf2b25c Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Wed, 31 Jul 2013 17:36:25 +0800
Subject: cgroup: remove struct cgroup_seqfile_state

We can use struct cfent instead.

v2:
- remove cgroup_seqfile_release().

Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 45 +++++++++++++--------------------------------
 1 file changed, 13 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 41b559f51502..ed2104304833 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2397,11 +2397,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  * supports string->u64 maps, but can be extended in future.
  */
 
-struct cgroup_seqfile_state {
-	struct cftype *cft;
-	struct cgroup *cgroup;
-};
-
 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
 {
 	struct seq_file *sf = cb->state;
@@ -2410,59 +2405,45 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
 
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 {
-	struct cgroup_seqfile_state *state = m->private;
-	struct cftype *cft = state->cft;
+	struct cfent *cfe = m->private;
+	struct cftype *cft = cfe->type;
+	struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
+
 	if (cft->read_map) {
 		struct cgroup_map_cb cb = {
 			.fill = cgroup_map_add,
 			.state = m,
 		};
-		return cft->read_map(state->cgroup, cft, &cb);
+		return cft->read_map(cgrp, cft, &cb);
 	}
-	return cft->read_seq_string(state->cgroup, cft, m);
-}
-
-static int cgroup_seqfile_release(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq = file->private_data;
-	kfree(seq->private);
-	return single_release(inode, file);
+	return cft->read_seq_string(cgrp, cft, m);
 }
 
 static const struct file_operations cgroup_seqfile_operations = {
 	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = seq_lseek,
-	.release = cgroup_seqfile_release,
+	.release = single_release,
 };
 
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
 	int err;
+	struct cfent *cfe;
 	struct cftype *cft;
 
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
-	cft = __d_cft(file->f_dentry);
+	cfe = __d_cfe(file->f_dentry);
+	cft = cfe->type;
 
 	if (cft->read_map || cft->read_seq_string) {
-		struct cgroup_seqfile_state *state;
-
-		state = kzalloc(sizeof(*state), GFP_USER);
-		if (!state)
-			return -ENOMEM;
-
-		state->cft = cft;
-		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
 		file->f_op = &cgroup_seqfile_operations;
-		err = single_open(file, cgroup_seqfile_show, state);
-		if (err < 0)
-			kfree(state);
-	} else if (cft->open)
+		err = single_open(file, cgroup_seqfile_show, cfe);
+	} else if (cft->open) {
 		err = cft->open(inode, file);
-	else
-		err = 0;
+	}
 
 	return err;
 }
-- cgit

From 6f4b7e632d78c2d91502211c430722cc66428492 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Wed, 31 Jul 2013 16:18:36 +0800
Subject: cgroup: more naming cleanups

Consistently use @cset for css_set variables and @cgrp for cgroup
variables.

Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 26 +++++++++++++-------------
 kernel/cpuset.c | 16 ++++++++--------
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed2104304833..9577bebe2546 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -466,7 +466,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
  * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
 static bool compare_css_sets(struct css_set *cset,
@@ -1839,7 +1839,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
 struct task_and_cgroup {
 	struct task_struct	*task;
 	struct cgroup		*cgrp;
-	struct css_set		*cg;
+	struct css_set		*cset;
 };
 
 struct cgroup_taskset {
@@ -2057,8 +2057,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 		tc = flex_array_get(group, i);
 		old_cset = task_css_set(tc->task);
-		tc->cg = find_css_set(old_cset, cgrp);
-		if (!tc->cg) {
+		tc->cset = find_css_set(old_cset, cgrp);
+		if (!tc->cset) {
 			retval = -ENOMEM;
 			goto out_put_css_set_refs;
 		}
@@ -2071,7 +2071,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
+		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
 	}
 	/* nothing is sensitive to fork() after this point. */
@@ -2091,9 +2091,9 @@ out_put_css_set_refs:
 	if (retval) {
 		for (i = 0; i < group_size; i++) {
 			tc = flex_array_get(group, i);
-			if (!tc->cg)
+			if (!tc->cset)
 				break;
-			put_css_set(tc->cg);
+			put_css_set(tc->cset);
 		}
 	}
 out_cancel_attach:
@@ -2203,9 +2203,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
-		struct cgroup *from_cg = task_cgroup_from_root(from, root);
+		struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
 
-		retval = cgroup_attach_task(from_cg, tsk, false);
+		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
 			break;
 	}
@@ -3305,8 +3305,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 	 * guarantees forward progress and that we don't miss any tasks.
 	 */
 	heap->size = 0;
-	cgroup_iter_start(scan->cg, &it);
-	while ((p = cgroup_iter_next(scan->cg, &it))) {
+	cgroup_iter_start(scan->cgrp, &it);
+	while ((p = cgroup_iter_next(scan->cgrp, &it))) {
 		/*
 		 * Only affect tasks that qualify per the caller's callback,
 		 * if he provided one
@@ -3339,7 +3339,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 		 * the heap and wasn't inserted
 		 */
 	}
-	cgroup_iter_end(scan->cg, &it);
+	cgroup_iter_end(scan->cgrp, &it);
 
 	if (heap->size) {
 		for (i = 0; i < heap->size; i++) {
@@ -3385,7 +3385,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 {
 	struct cgroup_scanner scan;
 
-	scan.cg = from;
+	scan.cgrp = from;
 	scan.test_task = NULL; /* select all tasks in cgroup */
 	scan.process_task = cgroup_transfer_one_task;
 	scan.heap = NULL;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 703bfd5a32a9..1b9c31549797 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -845,7 +845,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 {
 	struct cpuset *cpus_cs;
 
-	cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
+	cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cgrp));
 	set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
 }
 
@@ -866,7 +866,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 {
 	struct cgroup_scanner scan;
 
-	scan.cg = cs->css.cgroup;
+	scan.cgrp = cs->css.cgroup;
 	scan.test_task = NULL;
 	scan.process_task = cpuset_change_cpumask;
 	scan.heap = heap;
@@ -1062,7 +1062,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 static void cpuset_change_nodemask(struct task_struct *p,
 				   struct cgroup_scanner *scan)
 {
-	struct cpuset *cs = cgroup_cs(scan->cg);
+	struct cpuset *cs = cgroup_cs(scan->cgrp);
 	struct mm_struct *mm;
 	int migrate;
 	nodemask_t *newmems = scan->data;
@@ -1102,7 +1102,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
 
 	guarantee_online_mems(mems_cs, &newmems);
 
-	scan.cg = cs->css.cgroup;
+	scan.cgrp = cs->css.cgroup;
 	scan.test_task = NULL;
 	scan.process_task = cpuset_change_nodemask;
 	scan.heap = heap;
@@ -1275,7 +1275,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 static void cpuset_change_flag(struct task_struct *tsk,
 			       struct cgroup_scanner *scan)
 {
-	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
+	cpuset_update_task_spread_flag(cgroup_cs(scan->cgrp), tsk);
 }
 
 /*
@@ -1295,7 +1295,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
 {
 	struct cgroup_scanner scan;
 
-	scan.cg = cs->css.cgroup;
+	scan.cgrp = cs->css.cgroup;
 	scan.test_task = NULL;
 	scan.process_task = cpuset_change_flag;
 	scan.heap = heap;
@@ -1971,7 +1971,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct cpuset *parent = parent_cs(cs);
 	struct cpuset *tmp_cs;
-	struct cgroup *pos_cg;
+	struct cgroup *pos_cgrp;
 
 	if (!parent)
 		return 0;
@@ -2003,7 +2003,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
 	 * (and likewise for mems) to the new cgroup.
 	 */
 	rcu_read_lock();
-	cpuset_for_each_child(tmp_cs, pos_cg, parent) {
+	cpuset_for_each_child(tmp_cs, pos_cgrp, parent) {
 		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
 			rcu_read_unlock();
 			goto out_unlock;
-- cgit

From 4e96ee8e981b5140a2bcc5fff0d5c0eef39a62ee Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Wed, 31 Jul 2013 09:50:50 +0800
Subject: cgroup: convert cgroup_ida to cgroup_idr

This enables us to look up a cgroup by its id.

v4:
- add a comment for idr_remove() in cgroup_offline_fn().

v3:
- on success, idr_alloc() returns the id, not 0, so fix the BUG_ON()
  in cgroup_init().
- pass the right value to idr_alloc() so that the id for the dummy
  cgroup is 0.

Signed-off-by: Li Zefan
Reviewed-by: Michal Hocko
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9577bebe2546..3f6593333525 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -866,8 +866,6 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	dput(cgrp->parent->dentry);
 
-	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-
 	/*
 	 * Drop the active superblock reference that we took when we
 	 * created the cgroup.  This will free cgrp->root, if we are
@@ -1379,6 +1377,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	cgrp->root = root;
 	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
 	init_cgroup_housekeeping(cgrp);
+	idr_init(&root->cgroup_idr);
 }
 
 static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1451,7 +1450,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	 */
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
-	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
@@ -1467,7 +1465,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
 		/* hierarhcy ID shoulid already have been released */
 		WARN_ON_ONCE(root->hierarchy_id);
 
-		ida_destroy(&root->cgroup_ida);
+		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
 }
@@ -1582,6 +1580,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
+	root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
+				  0, 1, GFP_KERNEL);
+	if (root_cgrp->id < 0)
+		goto unlock_drop;
+
 	/* Check for name clashes with existing mounts */
 	ret = -EBUSY;
 	if (strlen(root->name))
@@ -4253,7 +4256,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		goto err_free_cgrp;
 	rcu_assign_pointer(cgrp->name, name);
 
-	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+	/*
+	 * Temporarily set the pointer to NULL, so idr_find() won't return
+	 * a half-baked cgroup.
+	 */
+	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
 	if (cgrp->id < 0)
 		goto err_free_name;
 
@@ -4351,6 +4358,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
+	idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
+
 	err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
 	if (err)
 		goto err_destroy;
@@ -4377,7 +4386,7 @@ err_free_all:
 	/* Release the reference count that we took on the superblock */
 	deactivate_super(sb);
 err_free_id:
-	ida_simple_remove(&root->cgroup_ida, cgrp->id);
+	idr_remove(&root->cgroup_idr, cgrp->id);
 err_free_name:
 	kfree(rcu_dereference_raw(cgrp->name));
 err_free_cgrp:
@@ -4570,6 +4579,14 @@ static void cgroup_offline_fn(struct work_struct *work)
 	/* delete this cgroup from parent->children */
 	list_del_rcu(&cgrp->sibling);
 
+	/*
+	 * We should remove the cgroup object from idr before its grace
+	 * period starts, so we won't be looking up a cgroup while the
+	 * cgroup is being freed.
+	 */
+	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+	cgrp->id = -1;
+
 	dput(d);
 
 	set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -4895,6 +4912,10 @@ int __init cgroup_init(void)
 
 	BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
 
+	err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
+			0, 1, GFP_KERNEL);
+	BUG_ON(err < 0);
+
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
-- cgit

From 876ede8b2b9880615be0de3ec7b8afd0a1786e76 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Thu, 1 Aug 2013 09:51:47 +0800
Subject: cgroup: restructure the failure path in cgroup_write_event_control()

It uses a single label and checks the validity of each pointer.  This
is error-prone, and we actually had a bug because one of the checks
was insufficient.  Use multiple labels as we do in other places.

v2:
- drop initializations of local variables.
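
The multi-label style referred to here is a common C idiom: each label
undoes exactly the resources acquired before the failure point, in
reverse order.  A stand-alone user-space sketch (hypothetical paths
and label names patterned after the patch, not the kernel code):

#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static int setup(void)
{
	char *buf;
	int efd, cfd, ret;

	buf = malloc(4096);
	if (!buf)
		return -1;

	efd = open("/tmp/event", O_CREAT | O_RDWR, 0600);
	if (efd < 0) {
		ret = -1;
		goto out_free;
	}

	cfd = open("/tmp/control", O_CREAT | O_RDWR, 0600);
	if (cfd < 0) {
		ret = -1;
		goto out_close_efd;
	}

	/* ... use buf, efd and cfd ... */

	close(cfd);
	close(efd);
	free(buf);
	return 0;

out_close_efd:
	close(efd);	/* undo the second acquisition */
out_free:
	free(buf);	/* undo the first acquisition */
	return ret;
}

int main(void)
{
	return setup() ? 1 : 0;
}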
Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f6593333525..9f6dab22289e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3934,11 +3934,11 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
 static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
-	struct cgroup_event *event = NULL;
+	struct cgroup_event *event;
 	struct cgroup *cgrp_cfile;
 	unsigned int efd, cfd;
-	struct file *efile = NULL;
-	struct file *cfile = NULL;
+	struct file *efile;
+	struct file *cfile;
 	char *endp;
 	int ret;
 
@@ -3964,31 +3964,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 	efile = eventfd_fget(efd);
 	if (IS_ERR(efile)) {
 		ret = PTR_ERR(efile);
-		goto fail;
+		goto out_kfree;
 	}
 
 	event->eventfd = eventfd_ctx_fileget(efile);
 	if (IS_ERR(event->eventfd)) {
 		ret = PTR_ERR(event->eventfd);
-		goto fail;
+		goto out_put_efile;
 	}
 
 	cfile = fget(cfd);
 	if (!cfile) {
 		ret = -EBADF;
-		goto fail;
+		goto out_put_eventfd;
 	}
 
 	/* the process need read permission on control file */
 	/* AV: shouldn't we check that it's been opened for read instead? */
 	ret = inode_permission(file_inode(cfile), MAY_READ);
 	if (ret < 0)
-		goto fail;
+		goto out_put_cfile;
 
 	event->cft = __file_cft(cfile);
 	if (IS_ERR(event->cft)) {
 		ret = PTR_ERR(event->cft);
-		goto fail;
+		goto out_put_cfile;
 	}
 
 	/*
@@ -3998,18 +3998,18 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 	cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
 	if (cgrp_cfile != cgrp) {
 		ret = -EINVAL;
-		goto fail;
+		goto out_put_cfile;
 	}
 
 	if (!event->cft->register_event || !event->cft->unregister_event) {
 		ret = -EINVAL;
-		goto fail;
+		goto out_put_cfile;
 	}
 
 	ret = event->cft->register_event(cgrp, event->cft,
 			event->eventfd, buffer);
 	if (ret)
-		goto fail;
+		goto out_put_cfile;
 
 	efile->f_op->poll(efile, &event->pt);
 
@@ -4029,16 +4029,13 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 
 	return 0;
 
-fail:
-	if (cfile)
-		fput(cfile);
-
-	if (event && event->eventfd && !IS_ERR(event->eventfd))
-		eventfd_ctx_put(event->eventfd);
-
-	if (!IS_ERR_OR_NULL(efile))
-		fput(efile);
-
+out_put_cfile:
+	fput(cfile);
+out_put_eventfd:
+	eventfd_ctx_put(event->eventfd);
+out_put_efile:
+	fput(efile);
+out_kfree:
 	kfree(event);
 
 	return ret;
-- cgit

From b395890a092d8ecbe54f005179e3dec4b6bf752a Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Thu, 1 Aug 2013 09:52:15 +0800
Subject: cgroup: rename cgroup_pidlist->mutex

It's an rw_semaphore, not a mutex.
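
For context, the distinction matters beyond naming: a reader-writer
semaphore admits multiple concurrent readers while writers are
exclusive, which a field named "mutex" obscures.  A user-space
analogue using POSIX rwlocks (illustrative field names, not the kernel
types):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t pidlist_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static int pidlist_length;

static int read_length(void)
{
	int len;

	pthread_rwlock_rdlock(&pidlist_rwsem);	/* many readers may enter */
	len = pidlist_length;
	pthread_rwlock_unlock(&pidlist_rwsem);
	return len;
}

static void set_length(int len)
{
	pthread_rwlock_wrlock(&pidlist_rwsem);	/* writers are exclusive */
	pidlist_length = len;
	pthread_rwlock_unlock(&pidlist_rwsem);
}

int main(void)
{
	set_length(42);
	printf("%d\n", read_length());
	return 0;
}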
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9f6dab22289e..9420662df87e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3436,7 +3436,7 @@ struct cgroup_pidlist { /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* protects the other fields */ - struct rw_semaphore mutex; + struct rw_semaphore rwsem; }; /* @@ -3509,7 +3509,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, struct pid_namespace *ns = task_active_pid_ns(current); /* - * We can't drop the pidlist_mutex before taking the l->mutex in case + * We can't drop the pidlist_mutex before taking the l->rwsem in case * the last ref-holder is trying to remove l from the list at the same * time. Holding the pidlist_mutex precludes somebody taking whichever * list we find out from under us - compare release_pid_array(). @@ -3518,7 +3518,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { /* make sure l doesn't vanish out from under us */ - down_write(&l->mutex); + down_write(&l->rwsem); mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3529,8 +3529,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_unlock(&cgrp->pidlist_mutex); return l; } - init_rwsem(&l->mutex); - down_write(&l->mutex); + init_rwsem(&l->rwsem); + down_write(&l->rwsem); l->key.type = type; l->key.ns = get_pid_ns(ns); l->owner = cgrp; @@ -3591,7 +3591,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l->list = array; l->length = length; l->use_count++; - up_write(&l->mutex); + up_write(&l->rwsem); *lp = l; return 0; } @@ -3669,7 +3669,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) int index = 0, pid = *pos; int *iter; - down_read(&l->mutex); + down_read(&l->rwsem); if (pid) { int end = l->length; @@ -3696,7 +3696,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); + up_read(&l->rwsem); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3742,7 +3742,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) * pidlist_mutex, we have to take pidlist_mutex first. */ mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); + down_write(&l->rwsem); BUG_ON(!l->use_count); if (!--l->use_count) { /* we're the last user if refcount is 0; remove and free */ @@ -3750,12 +3750,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) mutex_unlock(&l->owner->pidlist_mutex); pidlist_free(l->list); put_pid_ns(l->key.ns); - up_write(&l->mutex); + up_write(&l->rwsem); kfree(l); return; } mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); + up_write(&l->rwsem); } static int cgroup_pidlist_release(struct inode *inode, struct file *file) -- cgit From 8af01f56a03e9cbd91a55d688fce1315021efba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: s/cgroup_subsys_state/cgroup_css/ s/task_subsys_state/task_css/ The names of the two struct cgroup_subsys_state accessors - cgroup_subsys_state() and task_subsys_state() - are somewhat awkward. 
The former clashes with the type name and the latter doesn't even indicate it's somehow related to cgroup. We're about to revamp large portion of cgroup API, so, let's rename them so that they're less awkward. Most per-controller usages of the accessors are localized in accessor wrappers and given the amount of scheduled changes, this isn't gonna add any noticeable headache. Rename cgroup_subsys_state() to cgroup_css() and task_subsys_state() to task_css(). This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- kernel/cgroup_freezer.c | 4 ++-- kernel/cpuset.c | 6 +++--- kernel/events/core.c | 6 +++--- kernel/sched/core.c | 4 ++-- kernel/sched/cpuacct.c | 4 ++-- kernel/sched/sched.h | 6 +++--- 7 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ae4c46834633..0b3caa3220cb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -81,7 +81,7 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ #else static DEFINE_MUTEX(cgroup_mutex); #endif diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea5026..9d3f61566fec 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -47,13 +47,13 @@ struct freezer { static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), + return container_of(cgroup_css(cgroup, freezer_subsys_id), struct freezer, css); } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_subsys_state(task, freezer_subsys_id), + return container_of(task_css(task, freezer_subsys_id), struct freezer, css); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1b9c31549797..be4512ba2c0c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -117,14 +117,14 @@ struct cpuset { /* Retrieve the cpuset for a cgroup */ static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), + return container_of(cgroup_css(cgrp, cpuset_subsys_id), struct cpuset, css); } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_subsys_state(task, cpuset_subsys_id), + return container_of(task_css(task, cpuset_subsys_id), struct cpuset, css); } @@ -2724,7 +2724,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_subsys_state(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1833bc5a84a7..414c61f4d776 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -340,8 +340,8 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); + return container_of(task_css(task, perf_subsys_id), + struct perf_cgroup, css); } static inline bool @@ -7798,7 +7798,7 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) static void perf_cgroup_css_free(struct cgroup *cont) { struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + jc = container_of(cgroup_css(cont, 
perf_subsys_id), struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b1f2e533b95..323d907eac1a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6761,7 +6761,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7086,7 +7086,7 @@ int sched_rt_handler(struct ctl_table *table, int write, /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + return container_of(cgroup_css(cgrp, cpu_cgroup_subsys_id), struct task_group, css); } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95eb..4a210faaab77 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -36,14 +36,14 @@ struct cpuacct { /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + return container_of(cgroup_css(cgrp, cpuacct_subsys_id), struct cpuacct, css); } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + return container_of(task_css(tsk, cpuacct_subsys_id), struct cpuacct, css); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..471a56db05ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We cannot use task_subsys_state() and friends because the cgroup - * subsystem changes that value before the cgroup_subsys::attach() method - * is called, therefore we cannot pin it and might observe the wrong value. + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. * * The same is true for autogroup's p->signal->autogroup->tg, the autogroup * core changes this before calling sched_move_task(). -- cgit From c9710d8018273b0740e0794858f1961fcea5e61a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cpuset: drop "const" qualifiers from struct cpuset instances cpuset uses "const" qualifiers on struct cpuset in some functions; however, it doesn't work well when a value derived from a returned const pointer has to be passed to an accessor. It's C after all. Drop the "const" qualifiers except for the trivially leaf ones. This patch doesn't make any functional changes. 
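The mismatch is easy to demonstrate outside the kernel; a small hypothetical sketch (plain C, illustrative names) of why a parent-walking accessor cannot usefully take or return const pointers:

	#include <stddef.h>
	#include <stdio.h>

	struct node {
		struct node *parent;
		int value;
	};

	/* If this took "const struct node *", it would have to return
	 * "const struct node *" as well, and every caller wanting to pass
	 * the result on to a non-const API would need a cast. Dropping
	 * const at the accessor boundary avoids sprinkling casts - the
	 * point of the cpuset change above. */
	static struct node *parent_of(struct node *n)
	{
		return n->parent;
	}

	int main(void)
	{
		struct node root = { .parent = NULL, .value = 1 };
		struct node child = { .parent = &root, .value = 2 };

		printf("parent value: %d\n", parent_of(&child)->value);
		return 0;
	}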
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index be4512ba2c0c..f7371341d42a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -128,7 +128,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } -static inline struct cpuset *parent_cs(const struct cpuset *cs) +static inline struct cpuset *parent_cs(struct cpuset *cs) { struct cgroup *pcgrp = cs->css.cgroup->parent; @@ -319,8 +319,7 @@ static struct file_system_type cpuset_fs_type = { * * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, - struct cpumask *pmask) +static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); @@ -338,7 +337,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, * * Call with callback_mutex held. */ -static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); @@ -383,7 +382,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates */ -static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) { struct cpuset *trial; @@ -430,7 +429,7 @@ static void free_trial_cpuset(struct cpuset *trial) * Return 0 if valid, -errno if not. */ -static int validate_change(const struct cpuset *cur, const struct cpuset *trial) +static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup *cgrp; struct cpuset *c, *par; @@ -2343,7 +2342,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - const struct cpuset *cpus_cs; + struct cpuset *cpus_cs; rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); @@ -2416,7 +2415,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ -static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) +static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); @@ -2486,7 +2485,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) */ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - const struct cpuset *cs; /* current cpuset ancestors */ + struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) -- cgit From 72c97e54e0f043d33b246d7460ae0a36c4b8c643 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: add subsystem pointer to cgroup_subsys_state Currently, given a cgroup_subsys_state, there's no way to find out which subsystem the css is for, which we'll need to convert the cgroup controller API to primarily use @css instead of @cgroup. This patch adds cgroup_subsys_state->ss which points to the subsystem the @css belongs to. 
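The back-link is conventional C; a compilable non-kernel sketch (all names hypothetical) of recovering the owning subsystem from the state object alone:

	#include <stdio.h>

	struct subsys;

	struct state {
		struct subsys *ss;	/* back-pointer to the owning subsystem */
	};

	struct subsys {
		const char *name;
		struct state top_state;
	};

	/* With the back-pointer in place, code handed only a struct
	 * state * no longer needs the subsystem passed alongside it. */
	static const char *state_owner_name(struct state *st)
	{
		return st->ss->name;
	}

	int main(void)
	{
		struct subsys freezer = { .name = "freezer" };

		/* set once at init time, as init_cgroup_css() does */
		freezer.top_state.ss = &freezer;
		printf("owned by: %s\n", state_owner_name(&freezer.top_state));
		return 0;
	}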
While at it, remove the comment about accessing @css->cgroup to determine the hierarchy. cgroup core will provide API to traverse hierarchy of css'es and we don't want subsystems to directly walk cgroup hierarchies anymore. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b3caa3220cb..4234428f1014 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4186,6 +4186,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; + css->ss = ss; css->flags = 0; css->id = NULL; if (cgrp == cgroup_dummy_top) -- cgit From a7c6d554aa01236ac2a9f851ab0f75704f76dfa2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add/update accessors which obtain subsys specific data from css css (cgroup_subsys_state) is usually embedded in a subsys specific data structure. Subsystems either use container_of() directly to cast from css to such data structure or has an accessor function wrapping such cast. As cgroup as whole is moving towards using css as the main interface handle, add and update such accessors to ease dealing with css's. All accessors explicitly handle NULL input and return NULL in those cases. While this looks like an extra branch in the code, as all controllers specific data structures have css as the first field, the casting doesn't involve any offsetting and the compiler can trivially optimize out the branch. * blkio, freezer, cpuset, cpu, cpuacct and net_cls didn't have such accessor. Added. * memory, hugetlb and devices already had one but didn't explicitly handle NULL input. Updated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup_freezer.c | 11 +++++++---- kernel/cpuset.c | 11 +++++++---- kernel/sched/core.c | 8 ++++++-- kernel/sched/cpuacct.c | 11 +++++++---- 4 files changed, 27 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 9d3f61566fec..1db686e47a22 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -45,16 +45,19 @@ struct freezer { spinlock_t lock; }; +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of(cgroup_css(cgroup, freezer_subsys_id), - struct freezer, css); + return css_freezer(cgroup_css(cgroup, freezer_subsys_id)); } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_css(task, freezer_subsys_id), - struct freezer, css); + return css_freezer(task_css(task, freezer_subsys_id)); } static struct freezer *parent_freezer(struct freezer *freezer) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f7371341d42a..6e9cbdde25bd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -114,18 +114,21 @@ struct cpuset { int relax_domain_level; }; +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? 
container_of(css, struct cpuset, css) : NULL; +} + /* Retrieve the cpuset for a cgroup */ static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpuset_subsys_id), - struct cpuset, css); + return css_cs(cgroup_css(cgrp, cpuset_subsys_id)); } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_css(task, cpuset_subsys_id), - struct cpuset, css); + return css_cs(task_css(task, cpuset_subsys_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 323d907eac1a..5bccb0277129 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7083,11 +7083,15 @@ int sched_rt_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); + return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); } static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a210faaab77..8ccfa10cc89f 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -33,18 +33,21 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; +static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuacct, css) : NULL; +} + /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpuacct_subsys_id), - struct cpuacct, css); + return css_ca(cgroup_css(cgrp, cpuacct_subsys_id)); } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_css(tsk, cpuacct_subsys_id), - struct cpuacct, css); + return css_ca(task_css(tsk, cpuacct_subsys_id)); } static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -- cgit From 6387698699afd72d6304566fb6ccf84bffe07c56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add css_parent() Currently, controllers have to explicitly follow the cgroup hierarchy to find the parent of a given css. cgroup is moving towards using cgroup_subsys_state as the main controller interface construct, so let's provide a way to climb the hierarchy using just csses. This patch implements css_parent() which, given a css, returns its parent. The function is guaranteed to return a valid non-NULL parent css as long as the target css is not at the top of the hierarchy. freezer, cpuset, cpu, cpuacct, hugetlb, memory, net_cls and devices are converted to use css_parent() instead of accessing cgroup->parent directly. * __parent_ca() is dropped from cpuacct and its usage is replaced with parent_ca(). The only difference between the two was the NULL test on cgroup->parent, which is now embedded in css_parent(), making the distinction moot. Note that eventually a css->parent field will be added to css and the NULL check in css_parent() will go away. This patch shouldn't cause any behavior differences. 
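Both pieces together give the accessor idiom the following patches rely on; a self-contained userspace sketch, with container_of defined locally and a css->parent field standing in for the planned one (illustrative only, not kernel code):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct css {
		struct css *parent;	/* what the planned css->parent provides */
	};

	struct freezer {
		struct css css;		/* css embedded as the first member */
		int state;
	};

	/* NULL-tolerant downcast: with css first in the struct, the
	 * offset folds away and the branch is nearly free. */
	static struct freezer *css_freezer(struct css *css)
	{
		return css ? container_of(css, struct freezer, css) : NULL;
	}

	static struct css *css_parent(struct css *css)
	{
		return css->parent;	/* NULL only at the top of the hierarchy */
	}

	static struct freezer *parent_freezer(struct freezer *f)
	{
		return css_freezer(css_parent(&f->css));
	}

	int main(void)
	{
		struct freezer top = { .css.parent = NULL, .state = 0 };
		struct freezer child = { .css.parent = &top.css, .state = 1 };

		printf("child has parent: %s\n", parent_freezer(&child) ? "yes" : "no");
		printf("top has parent:   %s\n", parent_freezer(&top) ? "yes" : "no");
		return 0;
	}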
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup_freezer.c | 8 ++------ kernel/cpuset.c | 6 +----- kernel/sched/core.c | 9 +++------ kernel/sched/cpuacct.c | 11 ++--------- 4 files changed, 8 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 1db686e47a22..657a73cd44c4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -62,11 +62,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) static struct freezer *parent_freezer(struct freezer *freezer) { - struct cgroup *pcg = freezer->css.cgroup->parent; - - if (pcg) - return cgroup_freezer(pcg); - return NULL; + return css_freezer(css_parent(&freezer->css)); } bool cgroup_freezing(struct task_struct *task) @@ -234,7 +230,7 @@ static void freezer_fork(struct task_struct *task) * The root cgroup is non-freezable, so we can skip the * following check. */ - if (!freezer->css.cgroup->parent) + if (!parent_freezer(freezer)) goto out; spin_lock_irq(&freezer->lock); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6e9cbdde25bd..259a4af37e69 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -133,11 +133,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) static inline struct cpuset *parent_cs(struct cpuset *cs) { - struct cgroup *pcgrp = cs->css.cgroup->parent; - - if (pcgrp) - return cgroup_cs(pcgrp); - return NULL; + return css_cs(css_parent(&cs->css)); } #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bccb0277129..7a10742b389a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7114,13 +7114,10 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) static int cpu_cgroup_css_online(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent; + struct task_group *parent = css_tg(css_parent(&tg->css)); - if (!cgrp->parent) - return 0; - - parent = cgroup_tg(cgrp->parent); - sched_online_group(tg, parent); + if (parent) + sched_online_group(tg, parent); return 0; } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 8ccfa10cc89f..f6926a149a71 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -50,16 +50,9 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) return css_ca(task_css(tsk, cpuacct_subsys_id)); } -static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -{ - return cgroup_ca(ca->css.cgroup->parent); -} - static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - if (!ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); + return css_ca(css_parent(&ca->css)); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); @@ -284,7 +277,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) while (ca != &root_cpuacct) { kcpustat = this_cpu_ptr(ca->cpustat); kcpustat->cpustat[index] += val; - ca = __parent_ca(ca); + ca = parent_ca(ca); } rcu_read_unlock(); } -- cgit From eb95419b023abacb415e2a18fea899023ce7624d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. 
* With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy mean that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In the majority of cases, subsystems only care about their part in the cgroup hierarchy - i.e. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straightforward. A few noteworthy changes are: * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), an unnecessary open-coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- kernel/cgroup.c | 57 +++++++++++++++++++++++++++++-------------------- kernel/cgroup_freezer.c | 40 +++++++++++++++++----------------- kernel/cpuset.c | 39 +++++++++++++++++---------------- kernel/events/core.c | 18 +++++++++------- kernel/sched/core.c | 39 ++++++++++++++++----------------- kernel/sched/cpuacct.c | 9 ++++---- 6 files changed, 111 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4234428f1014..271d9a5cde5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -853,8 +853,11 @@ static void cgroup_free_fn(struct work_struct *work) /* * Release the subsystem state objects. 
*/ - for_each_root_subsys(cgrp->root, ss) - ss->css_free(cgrp); + for_each_root_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + ss->css_free(css); + } cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1056,7 +1059,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp); + ss->bind(cgrp->subsys[i]); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; @@ -1066,7 +1069,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top); + ss->bind(cgroup_dummy_top->subsys[i]); cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -2049,8 +2052,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); + retval = ss->can_attach(css, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2089,8 +2094,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->attach) - ss->attach(cgrp, &tset); + ss->attach(css, &tset); } /* @@ -2109,10 +2116,12 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); + ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -4206,14 +4215,15 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) - ret = ss->css_online(cgrp); + ret = ss->css_online(css); if (!ret) - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + css->flags |= CSS_ONLINE; return ret; } @@ -4228,9 +4238,9 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; if (ss->css_offline) - ss->css_offline(cgrp); + ss->css_offline(css); - cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; + css->flags &= ~CSS_ONLINE; } /* @@ -4305,7 +4315,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgrp); + css = ss->css_alloc(parent->subsys[ss->subsys_id]); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4313,7 +4323,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = percpu_ref_init(&css->refcnt, css_release); if (err) { - ss->css_free(cgrp); + ss->css_free(css); goto err_free_all; } @@ -4386,7 +4396,7 @@ err_free_all: if (css) { percpu_ref_cancel_init(&css->refcnt); - ss->css_free(cgrp); + ss->css_free(css); } } mutex_unlock(&cgroup_mutex); @@ -4641,7 +4651,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ 
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4720,7 +4730,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4836,7 +4846,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top); + ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5192,10 +5202,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ for_each_builtin_subsys(ss, i) { if (ss->exit) { - struct cgroup *old_cgrp = cset->subsys[i]->cgroup; - struct cgroup *cgrp = task_cgroup(tsk, i); + struct cgroup_subsys_state *old_css = cset->subsys[i]; + struct cgroup_subsys_state *css = task_css(tsk, i); - ss->exit(cgrp, old_cgrp, tsk); + ss->exit(css, old_css, tsk); } } } @@ -5529,7 +5539,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5539,9 +5550,9 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) return css; } -static void debug_css_free(struct cgroup *cgrp) +static void debug_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp->subsys[debug_subsys_id]); + kfree(css); } static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 657a73cd44c4..f03a85719c3c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -91,7 +91,8 @@ static const char *freezer_state_strs(unsigned int state) struct cgroup_subsys freezer_subsys; -static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; @@ -104,16 +105,16 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) } /** - * freezer_css_online - commit creation of a freezer cgroup - * @cgroup: cgroup being created + * freezer_css_online - commit creation of a freezer css + * @css: css being created * - * We're committing to creation of @cgroup. Mark it online and inherit + * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding both parent's and our * freezer->lock. 
*/ -static int freezer_css_online(struct cgroup *cgroup) +static int freezer_css_online(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); /* @@ -140,15 +141,15 @@ static int freezer_css_online(struct cgroup *cgroup) } /** - * freezer_css_offline - initiate destruction of @cgroup - * @cgroup: cgroup being destroyed + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed * - * @cgroup is going away. Mark it dead and decrement system_freezing_count - * if it was holding one. + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. */ -static void freezer_css_offline(struct cgroup *cgroup) +static void freezer_css_offline(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); spin_lock_irq(&freezer->lock); @@ -160,9 +161,9 @@ static void freezer_css_offline(struct cgroup *cgroup) spin_unlock_irq(&freezer->lock); } -static void freezer_css_free(struct cgroup *cgroup) +static void freezer_css_free(struct cgroup_subsys_state *css) { - kfree(cgroup_freezer(cgroup)); + kfree(css_freezer(css)); } /* @@ -174,25 +175,26 @@ static void freezer_css_free(struct cgroup *cgroup) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_subsys_state *new_css, + struct cgroup_taskset *tset) { - struct freezer *freezer = cgroup_freezer(new_cgrp); + struct freezer *freezer = css_freezer(new_css); struct task_struct *task; bool clear_frozen = false; spin_lock_irq(&freezer->lock); /* - * Make the new tasks conform to the current state of @new_cgrp. + * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. * - * Tasks in @tset are on @new_cgrp but may not conform to its + * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_cgrp, tset) { + cgroup_taskset_for_each(task, new_css->cgroup, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 259a4af37e69..8ce3fdc3dfcc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1455,9 +1455,10 @@ static int fmeter_getrate(struct fmeter *fmp) } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct task_struct *task; int ret; @@ -1468,11 +1469,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * flag is set. 
*/ ret = -ENOSPC; - if (!cgroup_sane_behavior(cgrp) && + if (!cgroup_sane_behavior(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1501,11 +1502,11 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup *cgrp, +static void cpuset_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mutex_lock(&cpuset_mutex); - cgroup_cs(cgrp)->attach_in_progress--; + css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); } @@ -1516,7 +1517,8 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; @@ -1524,7 +1526,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cgroup_cs(oldcgrp); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1539,7 +1541,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1940,11 +1942,12 @@ static struct cftype files[] = { * cgrp: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; - if (!cgrp->parent) + if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); @@ -1964,9 +1967,9 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cpuset_css_online(struct cgroup *cgrp) +static int cpuset_css_online(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; struct cgroup *pos_cgrp; @@ -1984,7 +1987,7 @@ static int cpuset_css_online(struct cgroup *cgrp) number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; /* @@ -2024,9 +2027,9 @@ out_unlock: * will call rebuild_sched_domains_locked(). 
*/ -static void cpuset_css_offline(struct cgroup *cgrp) +static void cpuset_css_offline(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -2039,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) mutex_unlock(&cpuset_mutex); } -static void cpuset_css_free(struct cgroup *cgrp) +static void cpuset_css_free(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); free_cpumask_var(cs->cpus_allowed); kfree(cs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 414c61f4d776..9705a0ed1dce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7778,7 +7778,8 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state * +perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; @@ -7795,11 +7796,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_css_free(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { - struct perf_cgroup *jc; - jc = container_of(cgroup_css(cont, perf_subsys_id), - struct perf_cgroup, css); + struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); + free_percpu(jc->info); kfree(jc); } @@ -7811,15 +7811,17 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css->cgroup, tset) task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, +static void perf_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task) { /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a10742b389a..622b7efc5ade 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7094,16 +7094,17 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); } -static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct task_group *tg, *parent; + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; - if (!cgrp->parent) { + if (!parent) { /* This is early initialization for the top cgroup */ return &root_task_group.css; } - parent = cgroup_tg(cgrp->parent); tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7111,38 +7112,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup *cgrp) +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent = css_tg(css_parent(&tg->css)); + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css_parent(css)); if (parent) sched_online_group(tg, parent); return 0; } -static void cpu_cgroup_css_free(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { 
- struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_destroy_group(tg); } -static void cpu_cgroup_css_offline(struct cgroup *cgrp) +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_offline_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -7153,18 +7154,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css->cgroup, tset) sched_move_task(task); } -static void -cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, - struct task_struct *task) +static void cpu_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f6926a149a71..1b784d9b3630 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -62,11 +62,12 @@ static struct cpuacct root_cpuacct = { }; /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuacct *ca; - if (!cgrp->parent) + if (!parent_css) return &root_cpuacct.css; ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -92,9 +93,9 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup_subsys_state *css) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); free_percpu(ca->cpustat); free_percpu(ca->cpuusage); -- cgit From 2bb566cb68dfafad328af666ebadf0e49accd6ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add subsys backlink pointer to cftype cgroup is transitioning to using css (cgroup_subsys_state) instead of cgroup as the primary subsystem handle. The cgroupfs file interface will be converted to use css's which requires finding out the subsystem from cftype so that the matching css can be determined from the cgroup. This patch adds cftype->ss which points to the subsystem the file belongs to. The field is initialized while a cftype is being registered. This makes it unnecessary to explicitly specify the subsystem for other cftype handling functions. @ss argument dropped from various cftype handling functions. This patch shouldn't introduce any behavior differences. 
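The registration-time stamping is the heart of the change; a minimal non-kernel sketch (hypothetical names) of walking a zero-name-terminated array once and filling in the back-link, so later consumers read it from the entry instead of taking an extra argument:

	#include <stdio.h>

	struct subsys;

	struct ftype {
		const char *name;
		struct subsys *ss;	/* back-link filled in at registration */
	};

	struct subsys {
		const char *name;
	};

	/* Mirrors what cgroup_add_cftypes() does to the cftype array:
	 * stamp the owner once, up front. */
	static void add_ftypes(struct subsys *ss, struct ftype *fts)
	{
		struct ftype *ft;

		for (ft = fts; ft->name[0] != '\0'; ft++)
			ft->ss = ss;
	}

	int main(void)
	{
		static struct subsys cpu = { .name = "cpu" };
		static struct ftype files[] = {
			{ .name = "shares" },
			{ .name = "stat" },
			{ .name = "" },		/* terminator, as in cftype arrays */
		};

		add_ftypes(&cpu, files);
		printf("%s belongs to %s\n", files[0].name, files[0].ss->name);
		return 0;
	}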
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Cc: Jens Axboe --- kernel/cgroup.c | 78 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 271d9a5cde5f..c4bc8dac3b1d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -219,8 +219,8 @@ static struct cftype cgroup_base_files[]; static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add); /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) @@ -974,7 +974,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, NULL, set->cfts, false); + cgroup_addrm_files(cgrp, set->cfts, false); } } @@ -1623,7 +1623,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cred = override_creds(&init_cred); - ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); if (ret) goto rm_base_files; @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, rm_base_files: free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false); + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); @@ -2694,8 +2694,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2705,8 +2704,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, subsys->name); + if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + strcpy(name, cft->ss->name); strcat(name, "."); } strcat(name, cft->name); @@ -2743,17 +2742,16 @@ out: /** * cgroup_addrm_files - add or remove files to a cgroup directory * @cgrp: the target cgroup - * @subsys: the subsystem of files to be added * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * All @cfts should belong to @subsys. For removals, this function never - * fails. If addition fails, this function doesn't remove files already - * added. The caller is responsible for cleaning up. + * For removals, this function never fails. If addition fails, this + * function doesn't remove files already added. The caller is responsible + * for cleaning up. 
*/ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add) +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add) { struct cftype *cft; int ret; @@ -2771,7 +2769,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - ret = cgroup_add_file(cgrp, subsys, cft); + ret = cgroup_add_file(cgrp, cft); if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", cft->name, ret); @@ -2796,11 +2794,11 @@ static void cgroup_cfts_prepare(void) mutex_lock(&cgroup_mutex); } -static int cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); + struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *cgrp, *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; @@ -2828,7 +2826,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, inode = root->dentry->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, ss, cfts, is_add); + ret = cgroup_addrm_files(root, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2851,7 +2849,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - ret = cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2883,51 +2881,56 @@ out_deact: int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + struct cftype *cft; int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; + cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - ret = cgroup_cfts_commit(ss, cfts, true); + ret = cgroup_cfts_commit(cfts, true); if (ret) - cgroup_rm_cftypes(ss, cfts); + cgroup_rm_cftypes(cfts); return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * - * Unregister @cfts from @ss. Files described by @cfts are removed from - * all existing cgroups to which @ss is attached and all future cgroups - * won't have them either. This function can be called anytime whether @ss - * is attached or not. + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered with @ss. + * registered. 
*/ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_rm_cftypes(struct cftype *cfts) { struct cftype_set *set; + if (!cfts || !cfts[0].ss) + return -ENOENT; + cgroup_cfts_prepare(); - list_for_each_entry(set, &ss->cftsets, node) { + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { if (set->cfts == cfts) { list_del(&set->node); kfree(set); - cgroup_cfts_commit(ss, cfts, false); + cgroup_cfts_commit(cfts, false); return 0; } } - cgroup_cfts_commit(ss, NULL, false); + cgroup_cfts_commit(NULL, false); return -ENOENT; } @@ -4148,7 +4151,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) continue; list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, ss, set->cfts, true); + ret = cgroup_addrm_files(cgrp, set->cfts, true); if (ret < 0) goto err; } @@ -4377,7 +4380,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; @@ -4538,7 +4541,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * but we aren't quite done with @cgrp yet, so hold onto it. */ cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); - cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false); + cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); @@ -4632,6 +4635,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) * deregistration. */ if (ss->base_cftypes) { + struct cftype *cft; + + for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) + cft->ss = ss; + ss->base_cftset.cfts = ss->base_cftypes; list_add_tail(&ss->base_cftset.node, &ss->cftsets); } -- cgit From f7d58818ba4249f04a83b73aaac135640050bb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pin cgroup_subsys_state when opening a cgroupfs file Previously, each file read/write operation relied on the inode reference count pinning the cgroup and simply checked whether the cgroup was marked dead before proceeding to invoke the per-subsystem callback. This was rather silly as it didn't have any synchronization or css pinning around the check and the cgroup may be removed and all css refs drained between the DEAD check and actual method invocation. This patch pins the css between open() and release() so that it is guaranteed to be alive for all file operations and remove the silly DEAD checks from cgroup_file_read/write(). 
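The pinning protocol can be modeled compactly in userspace C; a sketch using a plain C11 atomic counter in place of the kernel's percpu_ref (illustrative only):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct obj {
		atomic_int refcnt;	/* 0 means the object is dead */
	};

	/* Like css_tryget(): succeed only while the object is still live. */
	static bool obj_tryget(struct obj *o)
	{
		int old = atomic_load(&o->refcnt);

		while (old > 0)
			if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
				return true;
		return false;
	}

	static void obj_put(struct obj *o)
	{
		atomic_fetch_sub(&o->refcnt, 1);
	}

	/* open() pins; release() unpins - every file op in between can
	 * rely on the object staying alive, with no racy per-operation
	 * "is it dead?" check. */
	static int file_open(struct obj *o)
	{
		return obj_tryget(o) ? 0 : -1;	/* -ENODEV in the kernel */
	}

	static void file_release(struct obj *o)
	{
		obj_put(o);
	}

	int main(void)
	{
		struct obj o = { .refcnt = 1 };

		if (file_open(&o) == 0) {
			puts("pinned for the lifetime of the open file");
			file_release(&o);
		}
		return 0;
	}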
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c4bc8dac3b1d..583f8f66a7e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2277,6 +2277,17 @@ static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, return 0; } +/* return the css for the given cgroup file */ +static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) +{ + struct cftype *cft = cfe->type; + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + + if (cft->ss) + return cgrp->subsys[cft->ss->subsys_id]; + return NULL; +} + /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2354,8 +2365,6 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) @@ -2399,9 +2408,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; - if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); if (cft->read_u64) @@ -2447,15 +2453,22 @@ static const struct file_operations cgroup_seqfile_operations = { static int cgroup_file_open(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); int err; - struct cfent *cfe; - struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cfe = __d_cfe(file->f_dentry); - cft = cfe->type; + + /* + * If the file belongs to a subsystem, pin the css. Will be + * unpinned either on open failure or release. This ensures that + * @css stays alive for all file operations. + */ + if (css && !css_tryget(css)) + return -ENODEV; if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; @@ -2464,15 +2477,23 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } + if (css && err) + css_put(css); return err; } static int cgroup_file_release(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); + int ret = 0; + if (cft->release) - return cft->release(inode, file); - return 0; + ret = cft->release(inode, file); + if (css) + css_put(css); + return ret; } /* -- cgit From 67f4c36f83455b253445b2cb28ac9a2c4f85d99a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: add cgroup->dummy_css cgroup subsystem API is being converted to use css (cgroup_subsys_state) as the main handle, which makes things awkward for subsystem-agnostic core features - the "cgroup.*" interface files and various iterations - as they don't have a css to use. This patch adds cgroup->dummy_css which has NULL ->ss and whose only role is pointing back to the cgroup. This will be used to support subsystem agnostic features on the coming css based API. css_parent() is updated to handle dummy_css's. 
Note that css will soon grow its own ->parent field and css_parent() will be made trivial. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 583f8f66a7e1..c049992f1ffa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1365,6 +1365,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + cgrp->dummy_css.cgroup = cgrp; INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); @@ -2285,7 +2286,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) if (cft->ss) return cgrp->subsys[cft->ss->subsys_id]; - return NULL; + return &cgrp->dummy_css; } /* A buffer size big enough for numbers or short strings */ @@ -2467,7 +2468,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. */ - if (css && !css_tryget(css)) + if (css->ss && !css_tryget(css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2477,7 +2478,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } - if (css && err) + if (css->ss && err) css_put(css); return err; } -- cgit From 182446d087906de40e514573a92a97b203695f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in file methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup. Please see the previous commit which converts the subsystem methods for rationale. This patch converts all cftype file operations to take @css instead of @cgroup. cftypes for the cgroup core files don't have their subsystem pointer set. These will automatically use the dummy_css added by the previous patch and can be converted the same way. Most subsystem conversions are straightforward but there are some interesting ones. * freezer: update_if_frozen() is also converted to take @css instead of @cgroup for consistency. This will make the code look simpler too once iterators are converted to use css. * memory/vmpressure: mem_cgroup_from_css() needs to be exported to vmpressure while mem_cgroup_from_cont() can be made static. Updated accordingly. * cpu: cgroup_tg() doesn't have any user left. Removed. * cpuacct: cgroup_ca() doesn't have any user left. Removed. * hugetlb: hugetlb_cgroup_from_cgroup() doesn't have any user left. Removed. * net_cls: cgrp_cls_state() doesn't have any user left. Removed. 
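A condensed sketch of what the converted signatures look like in plain C: handlers take the state object directly, so one handle serves subsystem and core files alike (types are illustrative, not the kernel's):

	#include <stdio.h>

	struct css {
		int id;
	};

	struct ftype {
		const char *name;
		/* After the conversion every handler takes the css - the
		 * one handle that works for both subsystem files and, via
		 * the dummy css, core "cgroup.*" files. */
		long (*read)(struct css *css, const struct ftype *ft);
	};

	static long id_read(struct css *css, const struct ftype *ft)
	{
		(void)ft;
		return css->id;
	}

	int main(void)
	{
		struct css css = { .id = 7 };
		struct ftype ft = { .name = "id", .read = id_read };

		printf("%s = %ld\n", ft.name, ft.read(&css, &ft));
		return 0;
	}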
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- kernel/cgroup.c | 162 +++++++++++++++++++++++++----------------------- kernel/cgroup_freezer.c | 40 ++++++------ kernel/cpuset.c | 35 ++++++----- kernel/sched/core.c | 65 ++++++++++--------- kernel/sched/cpuacct.c | 28 ++++----- 5 files changed, 165 insertions(+), 165 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c049992f1ffa..6ee469837fda 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2235,34 +2235,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +static int cgroup_tasks_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 pid) { - return attach_task_by_pid(cgrp, pid, false); + return attach_task_by_pid(css->cgroup, pid, false); } -static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +static int cgroup_procs_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 tgid) { - return attach_task_by_pid(cgrp, tgid, true); + return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_release_agent_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); if (strlen(buffer) >= PATH_MAX) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); - strcpy(cgrp->root->release_agent_path, buffer); + strcpy(css->cgroup->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } -static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_release_agent_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { + struct cgroup *cgrp = css->cgroup; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); @@ -2271,10 +2275,10 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); return 0; } @@ -2292,10 +2296,10 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2313,22 
+2317,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); + retval = cft->write_u64(css, cft, val); } else { s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); + retval = cft->write_s64(css, cft, val); } if (!retval) retval = nbytes; return retval; } -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2351,7 +2355,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(cgrp, cft, strstrip(buffer)); + retval = cft->write_string(css, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -2361,60 +2365,60 @@ out: } static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); + return cft->write(css, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_string(css, cft, file, buf, nbytes, ppos); if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); + int ret = cft->trigger(css, (unsigned int)cft->private); return ret ? 
ret : nbytes; } return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); + u64 val = cft->read_u64(css, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); + s64 val = cft->read_s64(css, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); + return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); return -EINVAL; } @@ -2433,16 +2437,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(cgrp, cft, &cb); + return cft->read_map(css, cft, &cb); } - return cft->read_seq_string(cgrp, cft, m); + return cft->read_seq_string(css, cft, m); } static const struct file_operations cgroup_seqfile_operations = { @@ -3860,21 +3864,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); } -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) { - return notify_on_release(cgrp); + return notify_on_release(css->cgroup); } -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &cgrp->flags); + clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } @@ -3972,9 +3975,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { + struct cgroup *cgrp = css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4082,20 +4086,19 @@ out_kfree: return ret; } -static u64 cgroup_clone_children_read(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } -static int cgroup_clone_children_write(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } @@ -5585,17 +5588,19 @@ static void debug_css_free(struct cgroup_subsys_state *css) kfree(css); } -static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return cgroup_task_count(cgrp); + return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cgrp, +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, struct cftype *cft) { u64 count; @@ -5606,7 +5611,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, return count; } -static int current_css_set_cg_links_read(struct cgroup *cgrp, +static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *seq) { @@ -5633,14 +5638,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cgrp, - struct cftype *cft, - struct seq_file *seq) +static int cgroup_css_links_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) { + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; @@ -5659,9 +5663,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, return 0; } -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &cgrp->flags); + return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); } static struct cftype debug_files[] = { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f03a85719c3c..19613ba51444 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -245,7 +245,7 @@ out: /** * update_if_frozen - update whether a cgroup finished freezing - * @cgroup: cgroup of interest + * @css: css of interest * * Once FREEZING is initiated, 
transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, @@ -256,12 +256,12 @@ out: * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being - * migrated into or out of @cgroup, so we can't verify task states against + * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup) +static void update_if_frozen(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct cgroup *pos; struct cgroup_iter it; struct task_struct *task; @@ -275,7 +275,7 @@ static void update_if_frozen(struct cgroup *cgroup) goto out_unlock; /* are all (live) children frozen? */ - cgroup_for_each_child(pos, cgroup) { + cgroup_for_each_child(pos, css->cgroup) { struct freezer *child = cgroup_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && @@ -284,9 +284,9 @@ static void update_if_frozen(struct cgroup *cgroup) } /* are all tasks frozen? */ - cgroup_iter_start(cgroup, &it); + cgroup_iter_start(css->cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + while ((task = cgroup_iter_next(css->cgroup, &it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -301,12 +301,12 @@ static void update_if_frozen(struct cgroup *cgroup) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(cgroup, &it); + cgroup_iter_end(css->cgroup, &it); out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup *cgroup, struct cftype *cft, +static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { struct cgroup *pos; @@ -314,13 +314,13 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, cgroup) - update_if_frozen(pos); - update_if_frozen(cgroup); + cgroup_for_each_descendant_post(pos, css->cgroup) + update_if_frozen(cgroup_css(pos, freezer_subsys_id)); + update_if_frozen(css); rcu_read_unlock(); - seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } @@ -426,7 +426,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) rcu_read_unlock(); } -static int freezer_write(struct cgroup *cgroup, struct cftype *cft, +static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { bool freeze; @@ -438,20 +438,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, else return -EINVAL; - freezer_change_state(cgroup_freezer(cgroup), freeze); + freezer_change_state(css_freezer(css), freeze); return 0; } -static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } -static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return 
(bool)(freezer->state & CGROUP_FREEZING_PARENT); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8ce3fdc3dfcc..89b76e1d3aa1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1603,9 +1603,10 @@ typedef enum { FILE_SPREAD_SLAB, } cpuset_filetype_t; -static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1650,9 +1651,10 @@ out_unlock: return retval; } -static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1676,10 +1678,10 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. */ -static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cpuset_write_resmask(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *trialcs; int retval = -ENODEV; @@ -1758,13 +1760,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) return count; } -static ssize_t cpuset_common_file_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1794,9 +1795,9 @@ out: return retval; } -static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1825,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) return 0; } -static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 622b7efc5ade..cc9a49266382 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7088,12 +7088,6 @@ static inline struct task_group *css_tg(struct cgroup_subsys_state *css) return css ? 
container_of(css, struct task_group, css) : NULL; } -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) -{ - return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -7179,15 +7173,16 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) +static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); + return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); } @@ -7309,26 +7304,28 @@ long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_quota(cgroup_tg(cgrp)); + return tg_get_cfs_quota(css_tg(css)); } -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, - s64 cfs_quota_us) +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) { - return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_period(cgroup_tg(cgrp)); + return tg_get_cfs_period(css_tg(css)); } -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 cfs_period_us) +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) { - return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); + return tg_set_cfs_period(css_tg(css), cfs_period_us); } struct cfs_schedulable_data { @@ -7409,10 +7406,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; cb->fill(cb, "nr_periods", cfs_b->nr_periods); @@ -7425,26 +7422,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) +static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) { - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + return sched_group_set_rt_runtime(css_tg(css), val); } -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_runtime(cgroup_tg(cgrp)); + return 
sched_group_rt_runtime(css_tg(css)); } -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) +static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 rt_period_us) { - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); + return sched_group_set_rt_period(css_tg(css), rt_period_us); } -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_period(cgroup_tg(cgrp)); + return sched_group_rt_period(css_tg(css)); } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 1b784d9b3630..f64722ff0299 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -38,12 +38,6 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) return css ? container_of(css, struct cpuacct, css) : NULL; } -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return css_ca(cgroup_css(cgrp, cpuacct_subsys_id)); -} - /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { @@ -138,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; @@ -150,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) return totalcpuusage; } -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) +static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 reset) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int err = 0; int i; @@ -169,10 +163,10 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct cpuacct *ca = cgroup_ca(cgroup); + struct cpuacct *ca = css_ca(css); u64 percpu; int i; @@ -189,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct cgroup_map_cb *cb) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int cpu; s64 val = 0; -- cgit From 3b287a505ef4024634beb12a93773254909d5dae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: convert cgroup_next_sibling() to cgroup_next_child() cgroup is transitioning to using css (cgroup_subsys_state) as the main subsys interface handle instead of cgroup and the iterators will be updated to use css too. The iterators need to walk the cgroup hierarchy and return the css's matching the origin css, which is a bit cumbersome to open code. This patch converts cgroup_next_sibling() to cgroup_next_child() so that it can handle all steps of direct child iteration. 
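A direct child walk with the new interface then looks like this (editor's sketch, not from the patch; visit() is a placeholder):

	struct cgroup *pos = NULL;

	rcu_read_lock();
	/* %NULL @pos initiates the traversal; NULL return ends it */
	while ((pos = cgroup_next_child(pos, parent)))
		visit(pos);
	rcu_read_unlock();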
This will be used to update iterators to take @css instead of @cgrp. In addition to the new iteration init handling, cgroup_next_child() is restructured so that the different branches share the end of iteration condition check. This patch doesn't change any behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 59 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ee469837fda..dd55244952bd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3037,15 +3037,16 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_sibling - find the next sibling of a given cgroup - * @pos: the current cgroup + * cgroup_next_child - find the next child of a given cgroup + * @pos: the current position (%NULL to initiate traversal) + * @cgrp: cgroup whose descendants to walk * - * This function returns the next sibling of @pos and should be called - * under RCU read lock. The only requirement is that @pos is accessible. - * The next sibling is guaranteed to be returned regardless of @pos's - * state. + * This function returns the next child of @cgrp and should be called under + * RCU read lock. The only requirement is that @cgrp and @pos are + * accessible. The next sibling is guaranteed to be returned regardless of + * their states. */ -struct cgroup *cgroup_next_sibling(struct cgroup *pos) +struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) { struct cgroup *next; @@ -3061,30 +3062,30 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) * safe to dereference from this RCU critical section. If * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed * to be visible as %true here. + * + * If @pos is dead, its next pointer can't be dereferenced; + * however, as each cgroup is given a monotonically increasing + * unique serial number and always appended to the sibling list, + * the next one can be found by walking the parent's children until + * we see a cgroup with higher serial number than @pos's. While + * this path can be slower, it's taken only when either the current + * cgroup is removed or iteration and removal race. */ - if (likely(!cgroup_is_dead(pos))) { + if (!pos) { + next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + } else if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) - return next; - return NULL; + } else { + list_for_each_entry_rcu(next, &cgrp->children, sibling) + if (next->serial_nr > pos->serial_nr) + break; } - /* - * Can't dereference the next pointer. Each cgroup is given a - * monotonically increasing unique serial number and always - * appended to the sibling list, so the next one can be found by - * walking the parent's children until we see a cgroup with higher - * serial number than @pos's. - * - * While this path can be slow, it's taken only when either the - * current cgroup is removed or iteration and removal race. 
- */ - list_for_each_entry_rcu(next, &pos->parent->children, sibling) - if (next->serial_nr > pos->serial_nr) - return next; + if (&next->sibling != &cgrp->children) + return next; return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_sibling); +EXPORT_SYMBOL_GPL(cgroup_next_child); /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk @@ -3117,7 +3118,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; @@ -3198,7 +3199,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return cgroup_leftmost_descendant(next); @@ -4549,9 +4550,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to + * CGRP_DEAD assertion is depended upon by cgroup_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_sibling() for details. + * cgroup_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit From f48e3924dca268c677c4e338e5d91ad9e6fe6b9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: always use cgroup_next_child() to walk the children list There are several places where the children list is accessed directly. This patch converts those places to use cgroup_next_child(). This will help updating the hierarchy iterators to use @css instead of @cgrp. While cgroup_next_child() can be heavy in pathological cases - e.g. a lot of dead children, this shouldn't cause any noticeable behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dd55244952bd..2b7354faaca7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3112,7 +3112,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, pos = cgroup; /* visit the first child if exists */ - next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + next = cgroup_next_child(NULL, pos); if (next) return next; @@ -3151,7 +3151,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - list_for_each_entry_rcu(tmp, &last->children, sibling) + cgroup_for_each_child(tmp, last) pos = tmp; } while (pos); @@ -3165,8 +3165,7 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) do { last = pos; - pos = list_first_or_null_rcu(&pos->children, struct cgroup, - sibling); + pos = cgroup_next_child(NULL, pos); } while (pos); return last; -- cgit From 492eb21b98f88e411a8bb43d6edcd7d7022add10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:25 -0400 Subject: cgroup: make hierarchy iterators deal with cgroup_subsys_state instead of cgroup cgroup is currently in the process of transitioning to using css (cgroup_subsys_state) as the primary handle instead of cgroup in subsystem API. 
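The walk itself keeps its shape and only the handle type changes; a pre-order descendant walk becomes, for example (sketch distilled from the freezer conversion below):

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, &freezer->css) {
		struct freezer *pos_f = css_freezer(pos);

		/* ... per-descendant work ... */
	}
	rcu_read_unlock();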
For hierarchy iterators, this is beneficial because * In most cases, css is the only thing subsystems care about anyway. * On the planned unified hierarchy, iterations for different subsystems will need to skip over different subtrees of the hierarchy depending on which subsystems are enabled on each cgroup. Passing around css makes it unnecessary to explicitly specify the subsystem in question as css is intersection between cgroup and subsystem * For the planned unified hierarchy, css's would need to be created and destroyed dynamically independent from cgroup hierarchy. Having cgroup core manage css iteration makes enforcing deref rules a lot easier. Most subsystem conversions are straight-forward. Noteworthy changes are * blkio: cgroup_to_blkcg() is no longer used. Removed. * freezer: cgroup_freezer() is no longer used. Removed. * devices: cgroup_to_devcgroup() is no longer used. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe --- kernel/cgroup.c | 131 +++++++++++++++++++++++++++--------------------- kernel/cgroup_freezer.c | 25 ++++----- kernel/cpuset.c | 58 ++++++++++----------- 3 files changed, 112 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2b7354faaca7..91eac33fac86 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2814,8 +2814,8 @@ static void cgroup_cfts_prepare(void) /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. - * Instead, we use cgroup_for_each_descendant_pre() and drop RCU - * read lock before calling cgroup_addrm_files(). + * Instead, we use css_for_each_descendant_pre() and drop RCU read + * lock before calling cgroup_addrm_files(). */ mutex_lock(&cgroup_mutex); } @@ -2825,10 +2825,11 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *cgrp, *root = &ss->root->top_cgroup; + struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; struct inode *inode; + struct cgroup_subsys_state *css; u64 update_before; int ret = 0; @@ -2861,7 +2862,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - cgroup_for_each_descendant_pre(cgrp, root) { + css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + struct cgroup *cgrp = css->cgroup; + if (cgroup_is_dead(cgrp)) continue; @@ -3037,17 +3040,21 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_child - find the next child of a given cgroup - * @pos: the current position (%NULL to initiate traversal) - * @cgrp: cgroup whose descendants to walk + * css_next_child - find the next child of a given css + * @pos_css: the current position (%NULL to initiate traversal) + * @parent_css: css whose children to walk * - * This function returns the next child of @cgrp and should be called under - * RCU read lock. The only requirement is that @cgrp and @pos are - * accessible. The next sibling is guaranteed to be returned regardless of - * their states. + * This function returns the next child of @parent_css and should be called + * under RCU read lock. The only requirement is that @parent_css and + * @pos_css are accessible. 
The next sibling is guaranteed to be returned + * regardless of their states. */ -struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) +struct cgroup_subsys_state * +css_next_child(struct cgroup_subsys_state *pos_css, + struct cgroup_subsys_state *parent_css) { + struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; + struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3081,59 +3088,64 @@ struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) break; } - if (&next->sibling != &cgrp->children) - return next; - return NULL; + if (&next->sibling == &cgrp->children) + return NULL; + + if (parent_css->ss) + return cgroup_css(next, parent_css->ss->subsys_id); + else + return &next->dummy_css; } -EXPORT_SYMBOL_GPL(cgroup_next_child); +EXPORT_SYMBOL_GPL(css_next_child); /** - * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_pre(). Find the next - * descendant to visit for pre-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_pre(). Find the next descendant + * to visit for pre-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * and @root are accessible and @pos is a descendant of @root. */ -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @cgroup */ + /* if first iteration, pretend we just visited @root */ if (!pos) - pos = cgroup; + pos = root; /* visit the first child if exists */ - next = cgroup_next_child(NULL, pos); + next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ - while (pos != cgroup) { - next = cgroup_next_child(pos, pos->parent); + while (pos != root) { + next = css_next_child(pos, css_parent(pos)); if (next) return next; - pos = pos->parent; + pos = css_parent(pos); } return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** - * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup - * @pos: cgroup of interest + * css_rightmost_descendant - return the rightmost descendant of a css + * @pos: css of interest * - * Return the rightmost descendant of @pos. If there's no descendant, - * @pos is returned. This can be used during pre-order traversal to skip + * Return the rightmost descendant of @pos. If there's no descendant, @pos + * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires RCU read locking, it doesn't require the @@ -3141,9 +3153,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * function will return the correct rightmost descendant as long as @pos is * accessible. 
*/ -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last, *tmp; + struct cgroup_subsys_state *last, *tmp; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3151,62 +3164,64 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - cgroup_for_each_child(tmp, last) + css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } -EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); +EXPORT_SYMBOL_GPL(css_rightmost_descendant); -static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +static struct cgroup_subsys_state * +css_leftmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last; + struct cgroup_subsys_state *last; do { last = pos; - pos = cgroup_next_child(NULL, pos); + pos = css_next_child(NULL, pos); } while (pos); return last; } /** - * cgroup_next_descendant_post - find the next descendant for post-order walk + * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_post(). Find the next - * descendant to visit for post-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_post(). Find the next descendant + * to visit for post-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. */ -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, visit the leftmost descendant */ if (!pos) { - next = cgroup_leftmost_descendant(cgroup); - return next != cgroup ? next : NULL; + next = css_leftmost_descendant(root); + return next != root ? next : NULL; } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_child(pos, pos->parent); + next = css_next_child(pos, css_parent(pos)); if (next) - return cgroup_leftmost_descendant(next); + return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = pos->parent; - return next != cgroup ? next : NULL; + next = css_parent(pos); + return next != root ? next : NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); +EXPORT_SYMBOL_GPL(css_next_descendant_post); void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) @@ -4549,9 +4564,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_child() to + * CGRP_DEAD assertion is depended upon by css_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_child() for details. + * css_next_child() for details. 
*/ set_bit(CGRP_DEAD, &cgrp->flags); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 19613ba51444..98ca48d9ceb4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -50,11 +50,6 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) return css ? container_of(css, struct freezer, css) : NULL; } -static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) -{ - return css_freezer(cgroup_css(cgroup, freezer_subsys_id)); -} - static inline struct freezer *task_freezer(struct task_struct *task) { return css_freezer(task_css(task, freezer_subsys_id)); @@ -120,7 +115,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) /* * The following double locking and freezing state inheritance * guarantee that @cgroup can never escape ancestors' freezing - * states. See cgroup_for_each_descendant_pre() for details. + * states. See css_for_each_descendant_pre() for details. */ if (parent) spin_lock_irq(&parent->lock); @@ -262,7 +257,7 @@ out: static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); - struct cgroup *pos; + struct cgroup_subsys_state *pos; struct cgroup_iter it; struct task_struct *task; @@ -275,8 +270,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) goto out_unlock; /* are all (live) children frozen? */ - cgroup_for_each_child(pos, css->cgroup) { - struct freezer *child = cgroup_freezer(pos); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) @@ -309,13 +304,13 @@ out_unlock: static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, css->cgroup) - update_if_frozen(cgroup_css(pos, freezer_subsys_id)); + css_for_each_descendant_post(pos, css) + update_if_frozen(pos); update_if_frozen(css); rcu_read_unlock(); @@ -396,7 +391,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, */ static void freezer_change_state(struct freezer *freezer, bool freeze) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* update @freezer */ spin_lock_irq(&freezer->lock); @@ -409,8 +404,8 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * CGROUP_FREEZING_PARENT. */ rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { - struct freezer *pos_f = cgroup_freezer(pos); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 89b76e1d3aa1..be4f5036ea5e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -210,29 +210,29 @@ static struct cpuset top_cpuset = { /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. 
*/ -#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ - cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ - if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_cgrp by calling - * cgroup_rightmost_descendant() to skip subtree. + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. */ -#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ - cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ - if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* * There are two global mutexes guarding cpuset structures - cpuset_mutex @@ -430,7 +430,7 @@ static void free_trial_cpuset(struct cpuset *trial) static int validate_change(struct cpuset *cur, struct cpuset *trial) { - struct cgroup *cgrp; + struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret; @@ -438,7 +438,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; - cpuset_for_each_child(c, cgrp, cur) + cpuset_for_each_child(c, css, cur) if (!is_cpuset_subset(c, trial)) goto out; @@ -459,7 +459,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap */ ret = -EINVAL; - cpuset_for_each_child(c, cgrp, par) { + cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -508,13 +508,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } @@ -589,7 +589,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; doms = NULL; dattr = NULL; @@ -618,7 +618,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csn = 0; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. 
The @@ -635,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa[csn++] = cp; /* skip @cp's subtree */ - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -886,16 +886,16 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (update_root) update_tasks_cpumask(root_cs, heap); rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp have some CPU */ if (!cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget(&cp->css)) @@ -1143,16 +1143,16 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (update_root) update_tasks_nodemask(root_cs, heap); rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp have some CPU */ if (!nodes_empty(cp->mems_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget(&cp->css)) @@ -1973,7 +1973,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (!parent) return 0; @@ -2005,7 +2005,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_cgrp, parent) { + cpuset_for_each_child(tmp_cs, pos_css, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; @@ -2252,10 +2252,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { struct cpuset *cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!css_tryget(&cs->css)) continue; rcu_read_unlock(); -- cgit From d515876e9d951d8cf7fc7c90db2967664bdc89ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: relocate cgroup_advance_iter() For some reason, cgroup_advance_iter() is standing lonely all away from its iter comrades. Relocate it. This is cosmetic. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 91eac33fac86..d56d9363d4b3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2981,30 +2981,6 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } -/* - * Advance a list_head iterator. 
The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) -{ - struct list_head *l = it->cset_link; - struct cgrp_cset_link *link; - struct css_set *cset; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->cset_links) { - it->cset_link = NULL; - return; - } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } while (list_empty(&cset->tasks)); - it->cset_link = l; - it->task = cset->tasks.next; -} - /* * To reduce the fork() overhead for systems that are not actually * using their cgroups capability, we don't maintain the lists running @@ -3223,6 +3199,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); +/* + * Advance a list_head iterator. The iterator should be positioned at + * the start of a css_set + */ +static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +{ + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; + struct css_set *cset; + + /* Advance to the next non-empty css_set */ + do { + l = l->next; + if (l == &cgrp->cset_links) { + it->cset_link = NULL; + return; + } + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } while (list_empty(&cset->tasks)); + it->cset_link = l; + it->task = cset->tasks.next; +} + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { -- cgit From 0942eeeef68f9493c1bcb1a52baf612b73fcf9fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: rename cgroup_iter to cgroup_task_iter cgroup now has multiple iterators and it's quite confusing to have something which walks over tasks of a single cgroup named cgroup_iter. Let's rename it to cgroup_task_iter. While at it, reformat / update comments and replace the overview comment above the interface function decls with proper function comments. Such overview can be useful but function comments should be more than enough here. This is pure rename and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 114 ++++++++++++++++++++++++++++++++---------------- kernel/cgroup_freezer.c | 24 +++++----- 2 files changed, 89 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d56d9363d4b3..15c93f9c9e57 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -367,9 +367,11 @@ static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. Nests outside task->alloc_lock - * due to cgroup_iter_start() */ +/* + * css_set_lock protects the list of css_set objects, and the chain of + * tasks off each css_set. Nests outside task->alloc_lock due to + * cgroup_task_iter_start(). + */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -394,10 +396,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). 
This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ +/* + * We don't maintain the lists running through each css_set to its task + * until after the first call to cgroup_task_iter_start(). This reduces + * the fork()/exit() overhead for people who have cgroups compiled into + * their kernel but not actually in use. + */ static int use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) @@ -2982,10 +2986,10 @@ int cgroup_task_count(const struct cgroup *cgrp) } /* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first call to cgroup_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3199,11 +3203,15 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); -/* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set +/** + * cgroup_advance_task_iter - advance a task itererator to the next css_set + * @cgrp: the cgroup to walk tasks of + * @it: the iterator to advance + * + * Advance @it to the next css_set to walk. */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +static void cgroup_advance_task_iter(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3223,7 +3231,21 @@ static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) it->task = cset->tasks.next; } -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_start - initiate task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to use + * + * Initiate iteration through the tasks of @cgrp. The caller can call + * cgroup_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, cgroup_task_iter_end() must + * be called. + * + * Note that this function acquires a lock which is released when the + * iteration finishes. The caller can't sleep while iteration is in + * progress. + */ +void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) __acquires(css_set_lock) { /* @@ -3236,11 +3258,20 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) read_lock(&css_set_lock); it->cset_link = &cgrp->cset_links; - cgroup_advance_iter(cgrp, it); + cgroup_advance_task_iter(cgrp, it); } -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) +/** + * cgroup_task_iter_next - return the next task for the iterator + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator being iterated + * + * The "next" function for task iteration. @it should have been + * initialized via cgroup_task_iter_start(). Returns NULL when the + * iteration reaches the end. 
+ */ +struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3254,16 +3285,25 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, l = l->next; link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); if (l == &link->cset->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); + /* + * We reached the end of this task list - move on to the + * next cgrp_cset_link. + */ + cgroup_advance_task_iter(cgrp, it); } else { it->task = l; } return res; } -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_end - finish task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to finish + * + * Finish task iteration started by cgroup_task_iter_start(). + */ +void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3312,7 +3352,7 @@ static inline int started_after(void *p1, void *p2) * Iterate through all the tasks in a cgroup, calling test_task() for each, * and if it returns true, call process_task() for it also. * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() + * Effectively duplicates cgroup_task_iter_{start,next,end}() * but does not lock css_set_lock for the call to process_task(). * The struct cgroup_scanner may be embedded in any structure of the caller's * creation. @@ -3333,7 +3373,7 @@ static inline int started_after(void *p1, void *p2) int cgroup_scan_tasks(struct cgroup_scanner *scan) { int retval, i; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3368,8 +3408,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * guarantees forward progress and that we don't miss any tasks. 
*/ heap->size = 0; - cgroup_iter_start(scan->cgrp, &it); - while ((p = cgroup_iter_next(scan->cgrp, &it))) { + cgroup_task_iter_start(scan->cgrp, &it); + while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3402,7 +3442,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(scan->cgrp, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3608,7 +3648,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3623,8 +3663,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3635,7 +3675,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3669,7 +3709,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; /* @@ -3683,8 +3723,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3704,7 +3744,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); err: return ret; @@ -5137,7 +5177,7 @@ void cgroup_fork(struct task_struct *child) * Adds the task to the list running through its css_set if necessary and * call the subsystem fork() callbacks. Has to be after the task is * visible on the task list in case we race with the first call to - * cgroup_iter_start() - to guarantee that the new task ends up on its + * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ void cgroup_post_fork(struct task_struct *child) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 98ca48d9ceb4..c9177f8fc661 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -258,7 +258,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -279,9 +279,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css) } /* are all tasks frozen? 
*/ - cgroup_iter_start(css->cgroup, &it); + cgroup_task_iter_start(css->cgroup, &it); - while ((task = cgroup_iter_next(css->cgroup, &it))) { + while ((task = cgroup_task_iter_next(css->cgroup, &it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(css->cgroup, &it); + cgroup_task_iter_end(css->cgroup, &it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -323,25 +323,25 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, static void freeze_cgroup(struct freezer *freezer) { struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) freeze_task(task); - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } static void unfreeze_cgroup(struct freezer *freezer) { struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) __thaw_task(task); - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } /** -- cgit From c59cd3d840b1b0a8f996cbbd9132128dcaabbeb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cgroup_task_iter remember the cgroup being iterated Currently all cgroup_task_iter functions require @cgrp to be passed in, which is superfluous and increases the chance of usage errors. Make cgroup_task_iter remember the cgroup being iterated and drop the @cgrp argument from the next and end functions. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 32 +++++++++++++++----------------- kernel/cgroup_freezer.c | 12 ++++++------ 2 files changed, 21 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15c93f9c9e57..abc62ea1303c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3205,13 +3205,11 @@ EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * cgroup_advance_task_iter - advance a task itererator to the next css_set - * @cgrp: the cgroup to walk tasks of * @it: the iterator to advance * * Advance @it to the next css_set to walk. 
*/ -static void cgroup_advance_task_iter(struct cgroup *cgrp, - struct cgroup_task_iter *it) +static void cgroup_advance_task_iter(struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3220,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup *cgrp, /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &cgrp->cset_links) { + if (l == &it->origin_cgrp->cset_links) { it->cset_link = NULL; return; } @@ -3257,21 +3255,22 @@ void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); + + it->origin_cgrp = cgrp; it->cset_link = &cgrp->cset_links; - cgroup_advance_task_iter(cgrp, it); + + cgroup_advance_task_iter(it); } /** * cgroup_task_iter_next - return the next task for the iterator - * @cgrp: the cgroup to walk tasks of * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via cgroup_task_iter_start(). Returns NULL when the * iteration reaches the end. */ -struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, - struct cgroup_task_iter *it) +struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3289,7 +3288,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, * We reached the end of this task list - move on to the * next cgrp_cset_link. */ - cgroup_advance_task_iter(cgrp, it); + cgroup_advance_task_iter(it); } else { it->task = l; } @@ -3298,12 +3297,11 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, /** * cgroup_task_iter_end - finish task iteration - * @cgrp: the cgroup to walk tasks of * @it: the task iterator to finish * * Finish task iteration started by cgroup_task_iter_start(). 
*/ -void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) +void cgroup_task_iter_end(struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3409,7 +3407,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ heap->size = 0; cgroup_task_iter_start(scan->cgrp, &it); - while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { + while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3440,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_task_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3664,7 +3662,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, return -ENOMEM; /* now, populate the array */ cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3675,7 +3673,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3724,7 +3722,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) cgrp = dentry->d_fsdata; cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3744,7 +3742,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); err: return ret; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index c9177f8fc661..e0ab9bfd679a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -281,7 +281,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) /* are all tasks frozen? 
*/ cgroup_task_iter_start(css->cgroup, &it); - while ((task = cgroup_task_iter_next(css->cgroup, &it))) { + while ((task = cgroup_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_task_iter_end(css->cgroup, &it); + cgroup_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -327,9 +327,9 @@ static void freeze_cgroup(struct freezer *freezer) struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) + while ((task = cgroup_task_iter_next(&it))) freeze_task(task); - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) @@ -339,9 +339,9 @@ static void unfreeze_cgroup(struct freezer *freezer) struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) + while ((task = cgroup_task_iter_next(&it))) __thaw_task(task); - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } /** -- cgit From e535837b1dae17b5a2d76ea1bc22ac1a79354624 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: remove struct cgroup_scanner cgroup_scan_tasks() takes a pointer to struct cgroup_scanner as its sole argument and the only function of that struct is packing the arguments of the function call, which consist of five fields. It's not too unusual to pack parameters into a struct when the number of arguments gets excessive or the whole set needs to be passed around a lot, but neither holds here, making it just weird. Drop struct cgroup_scanner and pass the params directly to cgroup_scan_tasks(). Note that struct cpuset_change_nodemask_arg was added to cpuset.c to pass both the ->cs and ->newmems pointers to cpuset_change_nodemask() using a single data pointer. This doesn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 93 ++++++++++++++++++++++++++------------------------- kernel/cpuset.c | 63 ++++++++++++++++----------------- 2 files changed, 69 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abc62ea1303c..7b16ddb2569b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3343,32 +3343,37 @@ static inline int started_after(void *p1, void *p2) /** * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan + * @cgrp: the cgroup to iterate tasks of + * @test: optional test callback + * @process: process callback + * @data: data passed to @test and @process + * @heap: optional pre-allocated heap used for task iteration * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_task_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. 
This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. + * Iterate through all the tasks in a cgroup, calling @test for each, and + * if it returns %true, call @process for it also. * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. - * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). + * @test may be NULL, meaning always true (select all tasks), which + * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * lock css_set_lock for the call to @process. + * + * It is guaranteed that @process will act on every task that is a member + * of @cgrp for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different cgroup during + * the call, or are forked or move into the cgroup during the call. + * + * Note that @test may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should be + * cheap. + * + * If @heap is non-NULL, a heap has been pre-allocated and will be used for + * heap operations (and its "gt" member will be overwritten), else a + * temporary heap will be used (allocation of which may cause this function + * to fail). */ -int cgroup_scan_tasks(struct cgroup_scanner *scan) +int cgroup_scan_tasks(struct cgroup *cgrp, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; struct cgroup_task_iter it; @@ -3376,12 +3381,10 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; struct ptr_heap tmp_heap; - struct ptr_heap *heap; struct timespec latest_time = { 0, 0 }; - if (scan->heap) { + if (heap) { /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; heap->gt = &started_after; } else { /* We need to allocate our own heap memory */ @@ -3394,25 +3397,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) again: /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This + * Scan tasks in the cgroup, using the @test callback to determine + * which are of interest, and invoking @process callback on the + * ones which need an update. Since we don't want to hold any + * locks during the task updates, gather tasks to be processed in a + * heap structure. The heap is sorted by descending task start + * time. 
If the statically-sized heap fills up, we overflow tasks + * that started later, and in future iterations only consider tasks + * that started after the latest task in the previous pass. This * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_task_iter_start(scan->cgrp, &it); + cgroup_task_iter_start(cgrp, &it); while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one */ - if (scan->test_task && !scan->test_task(p, scan)) + if (test && !test(p, data)) continue; /* * Only process tasks that started after the last task @@ -3450,7 +3452,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(q, scan); + process(q, data); put_task_struct(q); } /* @@ -3467,10 +3469,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, - struct cgroup_scanner *scan) +static void cgroup_transfer_one_task(struct task_struct *task, void *data) { - struct cgroup *new_cgroup = scan->data; + struct cgroup *new_cgroup = data; mutex_lock(&cgroup_mutex); cgroup_attach_task(new_cgroup, task, false); @@ -3484,15 +3485,7 @@ static void cgroup_transfer_one_task(struct task_struct *task, */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - struct cgroup_scanner scan; - - scan.cgrp = from; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cgroup_transfer_one_task; - scan.heap = NULL; - scan.data = to; - - return cgroup_scan_tasks(&scan); + return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); } /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index be4f5036ea5e..6fe23f2ac742 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -830,7 +830,7 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * * Called by cgroup_scan_tasks() for each task in a cgroup whose * cpus_allowed mask needs to be changed. @@ -838,12 +838,11 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. 
*/ -static void cpuset_change_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_cpumask(struct task_struct *tsk, void *data) { - struct cpuset *cpus_cs; + struct cpuset *cs = data; + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cgrp)); set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } @@ -862,13 +861,8 @@ static void cpuset_change_cpumask(struct task_struct *tsk, */ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_cpumask; - scan.heap = heap; - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_cpumask, cs, + heap); } /* @@ -1052,20 +1046,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } +struct cpuset_change_nodemask_arg { + struct cpuset *cs; + nodemask_t *newmems; +}; + /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if * memory_migrate flag is set. Called with cpuset_mutex held. */ -static void cpuset_change_nodemask(struct task_struct *p, - struct cgroup_scanner *scan) +static void cpuset_change_nodemask(struct task_struct *p, void *data) { - struct cpuset *cs = cgroup_cs(scan->cgrp); + struct cpuset_change_nodemask_arg *arg = data; + struct cpuset *cs = arg->cs; struct mm_struct *mm; int migrate; - nodemask_t *newmems = scan->data; - cpuset_change_task_nodemask(p, newmems); + cpuset_change_task_nodemask(p, arg->newmems); mm = get_task_mm(p); if (!mm) @@ -1075,7 +1073,7 @@ static void cpuset_change_nodemask(struct task_struct *p, mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); + cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); mmput(mm); } @@ -1093,19 +1091,14 @@ static void *cpuset_being_rebound; static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cgroup_scanner scan; struct cpuset *mems_cs = effective_nodemask_cpuset(cs); + struct cpuset_change_nodemask_arg arg = { .cs = cs, + .newmems = &newmems }; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ guarantee_online_mems(mems_cs, &newmems); - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_nodemask; - scan.heap = heap; - scan.data = &newmems; - /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't * take while holding tasklist_lock. Forks can happen - the @@ -1116,7 +1109,8 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_nodemask, &arg, + heap); /* * All the tasks' nodemasks have been updated, update @@ -1263,17 +1257,18 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) /* * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * * Called by cgroup_scan_tasks() for each task in a cgroup. 
* * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_flag(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_flag(struct task_struct *tsk, void *data) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cgrp), tsk); + struct cpuset *cs = data; + + cpuset_update_task_spread_flag(cs, tsk); } /* @@ -1291,13 +1286,7 @@ static void cpuset_change_flag(struct task_struct *tsk, */ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_flag; - scan.heap = heap; - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_flag, cs, heap); } /* -- cgit From 72ec7029937f0518eff21b8762743c31591684f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make task iterators deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. This patch converts task iterators to deal with css instead of cgroup. Note that under unified hierarchy, different sets of tasks will be considered to belong to a given cgroup depending on the subsystem in question, and making the iterators deal with css instead of cgroup provides them with enough information about the iteration. While at it, fix several function comment formats in cpuset.c. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley --- kernel/cgroup.c | 112 ++++++++++++++++++++++++------------------------ kernel/cgroup_freezer.c | 26 ++++++----- kernel/cpuset.c | 41 ++++++++---------- 3 files changed, 88 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b16ddb2569b..8c57301d0561 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -370,7 +370,7 @@ static int cgroup_init_idr(struct cgroup_subsys *ss, /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to - * cgroup_task_iter_start(). + * css_task_iter_start(). */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -398,9 +398,9 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) /* * We don't maintain the lists running through each css_set to its task - * until after the first call to cgroup_task_iter_start(). This reduces - * the fork()/exit() overhead for people who have cgroups compiled into - * their kernel but not actually in use. + * until after the first call to css_task_iter_start(). This reduces the + * fork()/exit() overhead for people who have cgroups compiled into their + * kernel but not actually in use. 
*/ static int use_task_css_set_links __read_mostly; @@ -2989,7 +2989,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through * each css_set to its tasks until we see the list actually used - in other - * words after the first call to cgroup_task_iter_start(). + * words after the first call to css_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3204,12 +3204,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, EXPORT_SYMBOL_GPL(css_next_descendant_post); /** - * cgroup_advance_task_iter - advance a task itererator to the next css_set + * css_advance_task_iter - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void cgroup_advance_task_iter(struct cgroup_task_iter *it) +static void css_advance_task_iter(struct css_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3218,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_cgrp->cset_links) { + if (l == &it->origin_css->cgroup->cset_links) { it->cset_link = NULL; return; } @@ -3230,47 +3230,48 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_start - initiate task iteration - * @cgrp: the cgroup to walk tasks of + * css_task_iter_start - initiate task iteration + * @css: the css to walk tasks of * @it: the task iterator to use * - * Initiate iteration through the tasks of @cgrp. The caller can call - * cgroup_task_iter_next() to walk through the tasks until the function - * returns NULL. On completion of iteration, cgroup_task_iter_end() must - * be called. + * Initiate iteration through the tasks of @css. The caller can call + * css_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, css_task_iter_end() must be + * called. * * Note that this function acquires a lock which is released when the * iteration finishes. The caller can't sleep while iteration is in * progress. */ -void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it) __acquires(css_set_lock) { /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. + * The first time anyone tries to iterate across a css, we need to + * enable the list linking each css_set to its tasks, and fix up + * all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->origin_cgrp = cgrp; - it->cset_link = &cgrp->cset_links; + it->origin_css = css; + it->cset_link = &css->cgroup->cset_links; - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } /** - * cgroup_task_iter_next - return the next task for the iterator + * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been - * initialized via cgroup_task_iter_start(). Returns NULL when the - * iteration reaches the end. + * initialized via css_task_iter_start(). Returns NULL when the iteration + * reaches the end. 
*/ -struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) +struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3288,7 +3289,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) * We reached the end of this task list - move on to the * next cgrp_cset_link. */ - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } else { it->task = l; } @@ -3296,12 +3297,12 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_end - finish task iteration + * css_task_iter_end - finish task iteration * @it: the task iterator to finish * - * Finish task iteration started by cgroup_task_iter_start(). + * Finish task iteration started by css_task_iter_start(). */ -void cgroup_task_iter_end(struct cgroup_task_iter *it) +void css_task_iter_end(struct css_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3342,24 +3343,24 @@ static inline int started_after(void *p1, void *p2) } /** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @cgrp: the cgroup to iterate tasks of + * css_scan_tasks - iterate though all the tasks in a css + * @css: the css to iterate tasks of * @test: optional test callback * @process: process callback * @data: data passed to @test and @process * @heap: optional pre-allocated heap used for task iteration * - * Iterate through all the tasks in a cgroup, calling @test for each, and - * if it returns %true, call @process for it also. + * Iterate through all the tasks in @css, calling @test for each, and if it + * returns %true, call @process for it also. * * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * effectively duplicates css_task_iter_{start,next,end}() but does not * lock css_set_lock for the call to @process. * * It is guaranteed that @process will act on every task that is a member - * of @cgrp for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different cgroup during - * the call, or are forked or move into the cgroup during the call. + * of @css for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different css during the + * call, or are forked or move into the css during the call. * * Note that @test may be called with locks held, and may in some * situations be called multiple times for the same task, so it should be @@ -3370,13 +3371,13 @@ static inline int started_after(void *p1, void *p2) * temporary heap will be used (allocation of which may cause this function * to fail). 
*/ -int cgroup_scan_tasks(struct cgroup *cgrp, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3397,7 +3398,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, again: /* - * Scan tasks in the cgroup, using the @test callback to determine + * Scan tasks in the css, using the @test callback to determine * which are of interest, and invoking @process callback on the * ones which need an update. Since we don't want to hold any * locks during the task updates, gather tasks to be processed in a @@ -3408,8 +3409,8 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_task_iter_start(cgrp, &it); - while ((p = cgroup_task_iter_next(&it))) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3443,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * the heap and wasn't inserted */ } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3485,7 +3486,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); + return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, + to, NULL); } /* @@ -3639,7 +3641,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3654,8 +3656,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3666,7 +3668,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3700,7 +3702,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; /* @@ -3714,8 +3716,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3735,7 
+3737,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); err: return ret; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e0ab9bfd679a..5cd2b6d55243 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -258,7 +258,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -279,9 +279,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css) } /* are all tasks frozen? */ - cgroup_task_iter_start(css->cgroup, &it); + css_task_iter_start(css, &it); - while ((task = cgroup_task_iter_next(&it))) { + while ((task = css_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_task_iter_end(&it); + css_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -322,26 +322,24 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, static void freeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) freeze_task(task); - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) __thaw_task(task); - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } /** diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6fe23f2ac742..39e52175f4af 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -832,8 +832,8 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) * @tsk: task to test * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup whose - * cpus_allowed mask needs to be changed. + * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed + * mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. @@ -849,27 +849,26 @@ static void cpuset_change_cpumask(struct task_struct *tsk, void *data) /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. 
*/ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_cpumask, cs, - heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. @@ -1082,11 +1081,10 @@ static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. + * Called with cpuset_mutex held. No return value. It's guaranteed that + * css_scan_tasks() always returns 0 if @heap != NULL. */ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { @@ -1109,8 +1107,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_nodemask, &arg, - heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); /* * All the tasks' nodemasks have been updated, update @@ -1126,7 +1123,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. @@ -1254,12 +1251,12 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -/* +/** * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup. + * Called by css_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. @@ -1271,22 +1268,22 @@ static void cpuset_change_flag(struct task_struct *tsk, void *data) cpuset_update_task_spread_flag(cs, tsk); } -/* +/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. 
*/ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_flag, cs, heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); } /* -- cgit From 81eeaf0411204f52af8ef78ff107cfca2fcfec1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cftype->[un]register_event() is among the remaining couple of interfaces which still use struct cgroup. Convert it to cgroup_subsys_state. The conversion is mostly mechanical; it removes the last users of mem_cgroup_from_cont() and cg_to_vmpressure(), so both helpers are deleted. v2: indentation update as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c57301d0561..a71f2e0f9711 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -159,9 +159,9 @@ struct css_id { */ struct cgroup_event { /* - * Cgroup which the event belongs to. + * css which the event belongs to. */ - struct cgroup *cgrp; + struct cgroup_subsys_state *css; /* * Control file which the event associated. */ @@ -3955,11 +3955,12 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup *cgrp = event->cgrp; + struct cgroup_subsys_state *css = event->css; + struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(cgrp, event->cft, event->eventfd); + event->cft->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -3979,7 +3980,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->cgrp; + struct cgroup *cgrp = event->css->cgroup; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -4048,7 +4049,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->cgrp = cgrp; + event->css = css; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4099,7 +4100,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(cgrp, event->cft, + ret = event->cft->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; -- cgit From d99c8727e7bbc01b70e2c57e6127bfab26b868fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make cgroup_taskset deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. 
This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cgroup_taskset which is used by the subsystem attach methods is the last cgroup subsystem API which isn't using css as the handle. Update cgroup_taskset_cur_cgroup() to cgroup_taskset_cur_css() and cgroup_taskset_for_each() to take @skip_css instead of @skip_cgrp. The conversions are pretty mechanical. One exception is cpuset::cgroup_cs(), which lost its last user and got removed. This patch shouldn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Daniel Wagner Cc: Ingo Molnar Cc: Matt Helsley Cc: Steven Rostedt --- kernel/cgroup.c | 16 +++++++++------- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 15 +++++---------- kernel/events/core.c | 2 +- kernel/sched/core.c | 4 ++-- 5 files changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a71f2e0f9711..e5bfb2a81dcb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1907,18 +1907,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * cgroup_taskset_cur_css - return the matching css for the current task * @tset: taskset of interest + * @subsys_id: the ID of the target subsystem * - * Return the cgroup for the current (last returned) task of @tset. This - * function must be preceded by either cgroup_taskset_first() or - * cgroup_taskset_next(). + * Return the css for the current (last returned) task of @tset for + * subsystem specified by @subsys_id. This function must be preceded by + * either cgroup_taskset_first() or cgroup_taskset_next(). */ -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id) { - return tset->cur_cgrp; + return cgroup_css(tset->cur_cgrp, subsys_id); } -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); /** * cgroup_taskset_size - return the number of tasks in taskset diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5cd2b6d55243..224da9aa27f5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -189,7 +189,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css->cgroup, tset) { + cgroup_taskset_for_each(task, new_css, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 39e52175f4af..bf69717325b4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,12 +119,6 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) return css ? 
container_of(css, struct cpuset, css) : NULL; } -/* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) -{ - return css_cs(cgroup_css(cgrp, cpuset_subsys_id)); -} - /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { @@ -1459,7 +1453,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1511,9 +1505,10 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, + cpuset_subsys_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = cgroup_cs(oldcgrp); + struct cpuset *oldcs = css_cs(oldcss); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1527,7 +1522,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here diff --git a/kernel/events/core.c b/kernel/events/core.c index 9705a0ed1dce..c199c4f24910 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7816,7 +7816,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc9a49266382..a7122d5b8310 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7135,7 +7135,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7153,7 +7153,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } -- cgit From 95109b627ba6a043c181fa5fa45d1c754dd44fbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: unexport cgroup_css() cgroup_css() no longer has any user left outside cgroup.c proper and we don't want subsystems to grow new usages of the function. cgroup core should always provide the css to use to the subsystems, which will make dynamic creation and destruction of css's across the lifetime of a cgroup much more manageable than exposing the cgroup directly to subsystems and letting them dereference css's from it. Make cgroup_css() a static function in cgroup.c. 
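For illustration, here is the pattern this locks in, sketched from the cpuset conversion earlier in this series; the second helper below is a hypothetical callback shape, not code from this patch. Subsystems used to derive their css from a cgroup by hand; now cgroup core hands the css in directly:

	/* before: subsystem-side lookup, relying on the exported cgroup_css() */
	static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
	{
		return css_cs(cgroup_css(cgrp, cpuset_subsys_id));
	}

	/* after: callbacks receive the css from cgroup core and convert it */
	static void some_cpuset_callback(struct cgroup_subsys_state *css)
	{
		struct cpuset *cs = css_cs(css);
		/* operate on @cs directly; no cgroup dereference needed */
	}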
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5bfb2a81dcb..c02a288a4e3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -222,6 +222,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @subsys_id: the subsystem of interest + * + * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + int subsys_id) +{ + return cgrp->subsys[subsys_id]; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { -- cgit From bd8815a6d802fc16a7a106e170593aa05dc17e72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make css_for_each_descendant() and friends include the origin css in the iteration Previously, none of the css descendant iterators included the origin (root of subtree) css in the iteration. The reasons were consistency with css_for_each_child() and the fact that, at the time of introduction, more use cases needed to skip the origin anyway; however, given that css_is_descendant() considers self to be a descendant, omitting the origin css has become more confusing, and looking at the accumulated use cases rather clearly indicates that including the origin would result in simpler code overall. While this is a change which can easily lead to subtle bugs, the cgroup API, including the iterators, has recently gone through major restructuring and no out-of-tree changes will be applicable without adjustments, making this a relatively acceptable opportunity for this type of change. The conversions are mostly straightforward. If the iteration block had explicit origin handling before or after, it's moved inside the iteration. If not, an explicit "if (pos == origin) continue;" is added. Some conversions add extra reference get/put around origin handling by consolidating origin handling and the rest. While the extra ref operations aren't strictly necessary, this shouldn't cause any noticeable difference. 
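To make the new contract concrete, here is a rough sketch distilled from the freezer and cpuset conversions in this patch (simplified, not verbatim kernel code):

	/* before: the origin css was not visited, so it was handled by hand */
	css_for_each_descendant_post(pos, css)
		update_if_frozen(pos);
	update_if_frozen(css);		/* explicit origin handling */

	/* after: the origin is part of the walk; callers that only want
	 * strict descendants now skip it explicitly */
	css_for_each_descendant_pre(pos, root) {
		if (pos == root)
			continue;
		/* process strict descendants */
	}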
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Michal Hocko Cc: Jens Axboe Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 29 +++++++++++------------------ kernel/cgroup_freezer.c | 29 ++++++++++++++++------------- kernel/cpuset.c | 42 ++++++++++++++++++++++++++---------------- 3 files changed, 53 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c02a288a4e3d..52f0498db946 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,17 +2868,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) mutex_unlock(&cgroup_mutex); - /* @root always needs to be updated */ - inode = root->dentry->d_inode; - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, cfts, is_add); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - - if (ret) - goto out_deact; - /* add/rm files for all cgroups created before */ rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { @@ -2907,7 +2896,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) } rcu_read_unlock(); dput(prev); -out_deact: deactivate_super(sb); return ret; } @@ -3099,7 +3087,8 @@ EXPORT_SYMBOL_GPL(css_next_child); * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant - * to visit for pre-order traversal of @root's descendants. + * to visit for pre-order traversal of @root's descendants. @root is + * included in the iteration and the first node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This @@ -3114,9 +3103,9 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @root */ + /* if first iteration, visit @root */ if (!pos) - pos = root; + return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); @@ -3186,7 +3175,8 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant - * to visit for post-order traversal of @root's descendants. + * to visit for post-order traversal of @root's descendants. @root is + * included in the iteration and the last node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This @@ -3207,14 +3197,17 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return next != root ? next : NULL; } + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, css_parent(pos)); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = css_parent(pos); - return next != root ? 
next : NULL; + return css_parent(pos); } EXPORT_SYMBOL_GPL(css_next_descendant_post); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 224da9aa27f5..f0ff64d0ebaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -311,7 +311,6 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, /* update states bottom-up */ css_for_each_descendant_post(pos, css) update_if_frozen(pos); - update_if_frozen(css); rcu_read_unlock(); @@ -391,11 +390,6 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; - /* update @freezer */ - spin_lock_irq(&freezer->lock); - freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); - spin_unlock_irq(&freezer->lock); - /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as @@ -406,14 +400,23 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - /* - * Our update to @parent->state is already visible which is - * all we need. No need to lock @parent. For more info on - * synchronization, see freezer_post_create(). - */ spin_lock_irq(&pos_f->lock); - freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); + + if (pos_f == freezer) { + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + } else { + /* + * Our update to @parent->state is already visible + * which is all we need. No need to lock @parent. + * For more info on synchronization, see + * freezer_post_create(). + */ + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + } + spin_unlock_irq(&pos_f->lock); } rcu_read_unlock(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bf69717325b4..72a0383f382f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -222,7 +222,8 @@ static struct cpuset top_cpuset = { * * Walk @des_cs through the online descendants of @root_cs. Must be used * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. */ #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ @@ -506,6 +507,9 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) + continue; + /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); @@ -613,6 +617,8 @@ static int generate_sched_domains(cpumask_var_t **domains, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. 
The @@ -875,15 +881,17 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, struct cpuset *cp; struct cgroup_subsys_state *pos_css; - if (update_root) - update_tasks_cpumask(root_cs, heap); - rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -1130,15 +1138,17 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, struct cpuset *cp; struct cgroup_subsys_state *pos_css; - if (update_root) - update_tasks_nodemask(root_cs, heap); - rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!nodes_empty(cp->mems_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -2237,7 +2247,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget(&cs->css)) continue; rcu_read_unlock(); -- cgit From 40e93b39cd5b6a347333a95152ce37deef37bbd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:53 -0400 Subject: cgroup: always use cgroup_css() cgroup_css() is the accessor for cgroup->subsys[] but is not used consistently. cgroup->subsys[] will become RCU protected and cgroup_css() will grow synchronization sanity checks. In preparation, make all cgroup->subsys[] dereferences use cgroup_css() consistently. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 58 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52f0498db946..49ad96ee08e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -574,7 +574,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgrp->subsys[i]; + template[i] = cgroup_css(cgrp, i); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -871,7 +871,7 @@ static void cgroup_free_fn(struct work_struct *work) * Release the subsystem state objects. 
*/ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); ss->css_free(css); } @@ -1067,27 +1067,27 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgrp->subsys[i]); - BUG_ON(!cgroup_dummy_top->subsys[i]); - BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, i)); + BUG_ON(!cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; + cgroup_css(cgrp, i)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp->subsys[i]); + ss->bind(cgroup_css(cgrp, i)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, i) != cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top->subsys[i]); - cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; + ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -2072,7 +2072,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2114,7 +2114,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. 
*/ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->attach) ss->attach(css, &tset); @@ -2136,7 +2136,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss == failed_ss) break; @@ -2308,7 +2308,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); if (cft->ss) - return cgrp->subsys[cft->ss->subsys_id]; + return cgroup_css(cgrp, cft->ss->subsys_id); return &cgrp->dummy_css; } @@ -4241,7 +4241,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4285,7 +4285,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->id = NULL; if (cgrp == cgroup_dummy_top) css->flags |= CSS_ROOT; - BUG_ON(cgrp->subsys[ss->subsys_id]); + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; /* @@ -4300,7 +4300,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4315,7 +4315,7 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); lockdep_assert_held(&cgroup_mutex); @@ -4400,7 +4400,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(parent->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4477,7 +4477,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4590,7 +4590,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); /* * Killing would put the base ref, but we need to keep it @@ -4676,7 +4676,7 @@ static void cgroup_offline_fn(struct work_struct *work) * destruction happens only after all css's are released. 
*/ for_each_root_subsys(cgrp->root, ss) - css_put(cgrp->subsys[ss->subsys_id]); + css_put(cgroup_css(cgrp, ss->subsys_id)); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); @@ -4741,7 +4741,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4820,7 +4820,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); + ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5562,8 +5562,8 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; - parent_css = parent->subsys[subsys_id]; - child_css = child->subsys[subsys_id]; + parent_css = cgroup_css(parent, subsys_id); + child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; @@ -5624,7 +5624,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) /* get cgroup */ cgrp = __d_cgrp(f->f_dentry); - css = cgrp->subsys[id]; + css = cgroup_css(cgrp, id); return css ? css : ERR_PTR(-ENOENT); } -- cgit From 35ef10da65d43211f4cd7e7822cbb3becdfc0ae1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: rename cgroup_subsys_state->dput_work and its callback function css (cgroup_subsys_state) will become RCU protected and there will be two stages which require punting to work item during release. To prepare for using the work item for multiple times, rename css->dput_work to css->destroy_work and css_dput_fn() to css_free_work_fn() and move work item initialization from css init to right before the actual usage. This reorganization doesn't introduce any behavior change. 
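The reuse-enabling part of the change is that the work item is (re)initialized at the point of use rather than once at object init, so the same embedded item can carry different callbacks at different destruction stages. A minimal userspace sketch of that idea, with a plain function pointer standing in for INIT_WORK()/schedule_work(); all names here are hypothetical.

	#include <stdio.h>

	struct work {
		void (*fn)(struct work *);
	};

	/* synchronous stand-in for schedule_work(); the kernel defers this */
	static void run_work(struct work *w) { w->fn(w); }

	struct object {
		struct work destroy_work;	/* one embedded item, reused per stage */
	};

	static void free_stage(struct work *w)
	{
		printf("stage 2: free\n");
	}

	static void offline_stage(struct work *w)
	{
		/* container_of() analogue: destroy_work is the first member */
		struct object *obj = (struct object *)w;

		printf("stage 1: offline\n");
		obj->destroy_work.fn = free_stage;	/* re-init right before reuse */
		run_work(&obj->destroy_work);
	}

	int main(void)
	{
		struct object obj;

		obj.destroy_work.fn = offline_stage;	/* "INIT_WORK" at point of use */
		run_work(&obj.destroy_work);
		return 0;
	}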
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 49ad96ee08e1..0b280978f097 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4259,10 +4259,10 @@ err: return ret; } -static void css_dput_fn(struct work_struct *work) +static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, dput_work); + container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_dput(css->cgroup); } @@ -4272,7 +4272,14 @@ static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - schedule_work(&css->dput_work); + /* + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->destroy_work will be used to invoke + * dput() asynchronously from css_put(). + */ + INIT_WORK(&css->destroy_work, css_free_work_fn); + schedule_work(&css->destroy_work); } static void init_cgroup_css(struct cgroup_subsys_state *css, @@ -4287,14 +4294,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; - - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->dput_work will be used to invoke - * dput() asynchronously from css_put(). - */ - INIT_WORK(&css->dput_work, css_dput_fn); } /* invoke ->css_online() on a new CSS and mark it online if successful */ -- cgit From 0ae78e0bf10ac38ab53548e18383afc9997eca22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: add cgroup_subsys_state->parent With the planned unified hierarchy, css's (cgroup_subsys_state) will be RCU protected and allowed to be attached and detached dynamically over the course of a cgroup's lifetime. This means that css's will stay accessible after being detached from their cgroups - the matching pointers in cgroup->subsys[] cleared - for ref draining and RCU grace period. cgroup core still wants to guarantee that the parent css is never destroyed before its children and css_parent() always returns the parent regardless of the state of the child css as long as it's accessible. This patch makes css's hold onto their parents and adds css->parent so that the parent css is never destroyed before its children and can be determined without consulting the cgroups. cgroup->dummy_css is also updated to point to the parent dummy_css; however, it doesn't need to worry about object lifetime as the parent cgroup is already pinned by the child.
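The pinning rule itself is simple refcounting: a child takes a reference on its parent at creation and drops it only when the child itself is freed, so parents strictly outlive their children. A minimal userspace sketch follows, with a plain integer refcount standing in for percpu_ref; all names are illustrative.

	#include <stdio.h>
	#include <stdlib.h>

	struct css {
		struct css *parent;
		int refcnt;
	};

	static void css_get(struct css *c) { c->refcnt++; }

	static void css_put(struct css *c)
	{
		if (--c->refcnt)
			return;
		/* last ref gone: drop the ref taken on the parent at creation */
		if (c->parent)
			css_put(c->parent);
		printf("freeing css %p\n", (void *)c);
		free(c);
	}

	static struct css *css_create(struct css *parent)
	{
		struct css *c = calloc(1, sizeof(*c));	/* sketch: no NULL check */

		c->refcnt = 1;
		c->parent = parent;
		if (parent)
			css_get(parent);	/* child pins parent for its lifetime */
		return c;
	}

	int main(void)
	{
		struct css *parent = css_create(NULL);
		struct css *child = css_create(parent);

		css_put(parent);	/* parent survives: child still holds a ref */
		css_put(child);		/* frees child, then the parent */
		return 0;
	}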
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b280978f097..5c6dd7ed26a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4264,6 +4264,9 @@ static void css_free_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + if (css->parent) + css_put(css->parent); + cgroup_dput(css->cgroup); } @@ -4290,8 +4293,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; css->id = NULL; - if (cgrp == cgroup_dummy_top) + + if (cgrp->parent) + css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + else css->flags |= CSS_ROOT; + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; } @@ -4388,6 +4395,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->dentry = dentry; cgrp->parent = parent; + cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; if (notify_on_release(parent)) @@ -4436,9 +4444,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry */ - for_each_root_subsys(root, ss) + /* each css holds a ref to the cgroup's dentry and the parent css */ + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + dget(dentry); + percpu_ref_get(&css->parent->refcnt); + } /* hold a ref to the parent's dentry */ dget(parent->dentry); -- cgit From b77d7b6088377998ebf65eaea5e51008c2d75e94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: cgroup_css_from_dir() now should be called with RCU read locked cgroup->subsys[] will become RCU protected and thus all cgroup_css() usages should either be under RCU read lock or cgroup_mutex. This patch updates cgroup_css_from_dir() which returns the matching cgroup_subsys_state given a directory file and subsys_id so that it requires RCU read lock and updates its sole user perf_cgroup_connect(). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar --- kernel/cgroup.c | 12 ++++++++++-- kernel/events/core.c | 3 +++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c6dd7ed26a7..cbb6314f1836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5616,8 +5616,14 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/* - * get corresponding css from file open on cgroupfs directory +/** + * cgroup_css_from_dir - get corresponding css from file open on cgroup dir + * @f: directory file of interest + * @id: subsystem id of interest + * + * Must be called under RCU read lock. The caller is responsible for + * pinning the returned css if it needs to be accessed outside the RCU + * critical section. 
*/ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) { @@ -5625,6 +5631,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) { struct cgroup *cgrp; struct inode *inode; struct cgroup_subsys_state *css; + WARN_ON_ONCE(!rcu_read_lock_held()); + inode = file_inode(f); /* check in cgroup filesystem dir */ if (inode->i_op != &cgroup_dir_inode_operations) diff --git a/kernel/events/core.c b/kernel/events/core.c index c199c4f24910..23261f957713 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -591,6 +591,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; + rcu_read_lock(); + css = cgroup_css_from_dir(f.file, perf_subsys_id); if (IS_ERR(css)) { ret = PTR_ERR(css); @@ -617,6 +619,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: + rcu_read_unlock(); fdput(f); return ret; } -- cgit From 105347ba5da3e87facce2337c50cd5df93cc6bec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses, and cgroup_css() will soon require either holding cgroup_mutex or RCU read lock. This patch updates cgroup_file_open() such that it acquires the associated css under rcu_read_lock(). While cgroup_file_css() usages in other file operations are safe due to the reference from open, cgroup_css() wouldn't know that and will still trigger warnings. It'd be cleanest to store the acquired css in file->private_data for further file operations but that's already used by seqfile. This patch instead adds cfent->css to cache the associated css. Note that while this field is initialized during cfe init, it should only be considered valid while the file is open. This patch doesn't change visible behavior.
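The caching scheme reduces to: record the pointer when the file is created, then have open re-look it up and verify the two match before proceeding. A minimal userspace analogue is below; lookup_css() stands in for the rcu_read_lock() + cgroup_css() + css_tryget() sequence and every name is hypothetical.

	#include <stdio.h>

	struct css { int alive; };

	struct cfent {
		struct css *css;	/* cached at file-creation time */
	};

	/* stand-in for re-acquiring the css under RCU at open time */
	static struct css *lookup_css(struct css *live)
	{
		return live && live->alive ? live : NULL;
	}

	/* open-time check: the freshly looked-up css must match the cached one */
	static int file_open(struct cfent *cfe, struct css *current_css)
	{
		struct css *css = lookup_css(current_css);

		if (!css || css != cfe->css)
			return -1;	/* -ENODEV analogue: css replaced or gone */
		return 0;
	}

	int main(void)
	{
		struct css a = { .alive = 1 }, b = { .alive = 1 };
		struct cfent cfe = { .css = &a };

		printf("open vs cached css: %d\n", file_open(&cfe, &a));	/* 0 */
		printf("open vs newer css:  %d\n", file_open(&cfe, &b));	/* -1 */
		return 0;
	}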
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cbb6314f1836..d63beffd41e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -117,6 +117,7 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + struct cgroup_subsys_state *css; /* file xattrs */ struct simple_xattrs xattrs; @@ -2301,17 +2302,6 @@ static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, return 0; } -/* return the css for the given cgroup file */ -static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) -{ - struct cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - - if (cft->ss) - return cgroup_css(cgrp, cft->ss->subsys_id); - return &cgrp->dummy_css; -} - /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2388,7 +2378,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->write) return cft->write(css, cft, file, buf, nbytes, ppos); @@ -2430,7 +2420,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read) return cft->read(css, cft, file, buf, nbytes, ppos); @@ -2456,7 +2446,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read_map) { struct cgroup_map_cb cb = { @@ -2479,7 +2469,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css; int err; err = generic_file_open(inode, file); @@ -2491,7 +2482,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. 
*/ - if (css->ss && !css_tryget(css)) + rcu_read_lock(); + if (cft->ss) { + css = cgroup_css(cgrp, cft->ss->subsys_id); + if (!css_tryget(css)) + css = NULL; + } else { + css = &cgrp->dummy_css; + } + rcu_read_unlock(); + + /* css should match @cfe->css, see cgroup_add_file() for details */ + if (!css || WARN_ON_ONCE(css != cfe->css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2510,7 +2512,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; int ret = 0; if (cft->release) @@ -2772,6 +2774,18 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); + /* + * cfe->css is used by read/write/close to determine the associated + * css. file->private_data would be a better place but that's + * already used by seqfile. Note that open will use the usual + * cgroup_css() and css_tryget() to acquire the css and this + * caching doesn't affect css lifetime management. + */ + if (cft->ss) + cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); + else + cfe->css = &cgrp->dummy_css; + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit From 73e80ed8007fc48a6deeb295ba37159fad274bd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: add __rcu modifier to cgroup->subsys[] For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses. Previous changes ensured that all cgroup->subsys[] accesses use the cgroup_css() accessor. This patch adds __rcu modifier to cgroup->subsys[], add matching RCU dereference in cgroup_css() and convert all assignments to either rcu_assign_pointer() or RCU_INIT_POINTER(). This change prepares for the actual RCUfication of css's and doesn't introduce any visible behavior change. The conversion is verified with sparse and all accesses are properly RCU annotated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d63beffd41e1..c27101622567 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -229,11 +229,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * @subsys_id: the subsystem of interest * * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + * This function must be called either under cgroup_mutex or + * rcu_read_lock() and the caller is responsible for pinning the returned + * css if it wants to keep accessing it outside the said locks. This + * function may return %NULL if @cgrp doesn't have @subsys_id enabled. 
*/ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, int subsys_id) { - return cgrp->subsys[subsys_id]; + return rcu_dereference_check(cgrp->subsys[subsys_id], + lockdep_is_held(&cgroup_mutex)); } /* convenient tests for these bits */ @@ -1072,8 +1077,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(!cgroup_css(cgroup_dummy_top, i)); BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); - cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; + rcu_assign_pointer(cgrp->subsys[i], + cgroup_css(cgroup_dummy_top, i)); cgroup_css(cgrp, i)->cgroup = cgrp; + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) @@ -1088,8 +1095,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; - cgrp->subsys[i] = NULL; + RCU_INIT_POINTER(cgrp->subsys[i], NULL); + cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -4314,7 +4323,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - cgrp->subsys[ss->subsys_id] = css; + rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4962,7 +4971,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * also takes care of freeing the css_id. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); - cgroup_dummy_top->subsys[ss->subsys_id] = NULL; + RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); } -- cgit From 623f926b050e12b0f5e3a2f4d11c36e4ddd63541 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: reorganize css init / exit paths css (cgroup_subsys_state) lifetime management is about to be restructured. In preparation, make the following mostly trivial changes. * init_cgroup_css() is renamed to init_css() so that it's consistent with other css handling functions. * alloc_css_id(), online_css() and offline_css() updated to take @css instead of cgroups and subsys IDs. This patch doesn't make any functional changes. v2: v1 merged two for_each_root_subsys() loops in cgroup_create() but Li Zefan pointed out that it breaks error path. Dropped.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c27101622567..a1ebc445f350 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -838,8 +838,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys *ss, - struct cgroup *parent, struct cgroup *child); +static int alloc_css_id(struct cgroup_subsys_state *child_css); static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { @@ -4308,9 +4307,8 @@ static void css_release(struct percpu_ref *ref) schedule_work(&css->destroy_work); } -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, + struct cgroup *cgrp) { css->cgroup = cgrp; css->ss = ss; @@ -4327,9 +4325,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* invoke ->css_online() on a new CSS and mark it online if successful */ -static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static int online_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4342,9 +4340,9 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ -static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void offline_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); @@ -4442,10 +4440,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; } - init_cgroup_css(css, ss, cgrp); + init_css(css, ss, cgrp); if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); + err = alloc_css_id(css); if (err) goto err_free_all; } @@ -4480,7 +4478,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - err = online_css(ss, cgrp); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + + err = online_css(css); if (err) goto err_destroy; @@ -4700,7 +4700,7 @@ static void cgroup_offline_fn(struct work_struct *work) * initate destruction. */ for_each_root_subsys(cgrp->root, ss) - offline_css(ss, cgrp); + offline_css(cgroup_css(cgrp, ss->subsys_id)); /* * Put the css refs from cgroup_destroy_locked(). Each css holds @@ -4778,7 +4778,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, cgroup_dummy_top); + init_css(css, ss, cgroup_dummy_top); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4793,7 +4793,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. 
*/ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(ss, cgroup_dummy_top)); + BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); mutex_unlock(&cgroup_mutex); @@ -4866,8 +4866,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_cgroup_css because it sets css->id. */ + init_css(css, ss, cgroup_dummy_top); + /* init_idr must be after init_css() because it sets css->id. */ if (ss->use_id) { ret = cgroup_init_idr(ss, css); if (ret) @@ -4897,7 +4897,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(ss, cgroup_dummy_top); + ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ret) goto err_unload; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(ss, cgroup_dummy_top); + offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5588,20 +5588,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, return 0; } -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, - struct cgroup *child) +static int alloc_css_id(struct cgroup_subsys_state *child_css) { - int subsys_id, i, depth = 0; - struct cgroup_subsys_state *parent_css, *child_css; + struct cgroup_subsys_state *parent_css = css_parent(child_css); struct css_id *child_id, *parent_id; + int i, depth; - subsys_id = ss->subsys_id; - parent_css = cgroup_css(parent, subsys_id); - child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; - child_id = get_new_cssid(ss, depth); + child_id = get_new_cssid(child_css->ss, depth); if (IS_ERR(child_id)) return PTR_ERR(child_id); -- cgit From ae7f164a09408bf21ab3c82a9e80a3ff37aa9e3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: move cgroup->subsys[] assignment to online_css() Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. In preparation, this patch moves cgroup->subsys[] assignment from init_css() to online_css(). As this means that a newly initialized css should be remembered separately and that cgroup_css() returns NULL between init and online, cgroup_create() is updated so that it stores newly created css's in a local array css_ar[] and cgroup_init/load_subsys() are updated to use local variable @css instead of using cgroup_css(). This change also slightly simplifies error path of cgroup_create(). While this patch changes when cgroup->subsys[] is initialized, this change isn't visible to subsystems or userland. v2: This patch wasn't updated accordingly after the previous "cgroup: reorganize css init / exit paths" was updated leading to missing a css_ar[] conversion in cgroup_create() and thus boot failure. Fix it. 
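The staging pattern here - build new objects in a local array, publish into the shared array only when onlining succeeds - looks like this in a minimal userspace sketch. css_ar mirrors the local array in the patch; everything else (types, helpers) is illustrative, not the kernel API.

	#include <stdio.h>

	#define N_SUBSYS 2

	struct css { int id; };

	struct cgroup {
		struct css *subsys[N_SUBSYS];	/* only populated once online */
	};

	static int online_css(struct cgroup *cgrp, struct css *css)
	{
		/* publish only after ->css_online() would have succeeded */
		cgrp->subsys[css->id] = css;
		return 0;
	}

	int main(void)
	{
		struct cgroup cgrp = { { NULL } };
		struct css a = { .id = 0 }, b = { .id = 1 };
		struct css *css_ar[N_SUBSYS] = { &a, &b };	/* local staging array */

		/* between init and online, lookups in cgrp.subsys[] see NULL */
		for (int i = 0; i < N_SUBSYS; i++)
			printf("pre-online slot %d: %p\n", i, (void *)cgrp.subsys[i]);

		for (int i = 0; i < N_SUBSYS; i++)
			online_css(&cgrp, css_ar[i]);

		for (int i = 0; i < N_SUBSYS; i++)
			printf("post-online slot %d: %p\n", i, (void *)cgrp.subsys[i]);
		return 0;
	}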
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a1ebc445f350..b9f736c3b36d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4321,7 +4321,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4334,8 +4333,10 @@ static int online_css(struct cgroup_subsys_state *css) if (ss->css_online) ret = ss->css_online(css); - if (!ret) + if (!ret) { css->flags |= CSS_ONLINE; + rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + } return ret; } @@ -4366,6 +4367,7 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4433,12 +4435,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = PTR_ERR(css); goto err_free_all; } + css_ar[ss->subsys_id] = css; err = percpu_ref_init(&css->refcnt, css_release); - if (err) { - ss->css_free(css); + if (err) goto err_free_all; - } init_css(css, ss, cgrp); @@ -4467,7 +4468,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* each css holds a ref to the cgroup's dentry and the parent css */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); percpu_ref_get(&css->parent->refcnt); @@ -4478,7 +4479,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; err = online_css(css); if (err) @@ -4511,7 +4512,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4793,7 +4794,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); + BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); @@ -4897,7 +4898,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ret = online_css(css); if (ret) goto err_unload; -- cgit From 223dbc38d2a8745a93749dc75ed909e274ce075d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: bounce cgroup_subsys_state ref kill confirmation to a work item css (cgroup_subsys_state) offlining, which requires process context, will be moved to ref kill confirmation. In preparation, bounce css_killed handling through css->destroy_work. css_ref_killed_fn() is renamed to css_killed_ref_fn() so that it's consistent with the new css_killed_work_fn(). 
This patch adds an additional work item bouncing but doesn't change the actual logic. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9f736c3b36d..398ffbbee32f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4555,12 +4555,27 @@ static void cgroup_css_killed(struct cgroup *cgrp) schedule_work(&cgrp->destroy_work); } -static void css_ref_killed_fn(struct percpu_ref *ref) +/* + * This is called when the refcnt of a css is confirmed to be killed. + * css_tryget() is now guaranteed to fail. + */ +static void css_killed_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; + + cgroup_css_killed(cgrp); +} + +/* css kill confirmation processing requires process context, bounce */ +static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - cgroup_css_killed(css->cgroup); + INIT_WORK(&css->destroy_work, css_killed_work_fn); + schedule_work(&css->destroy_work); } /** @@ -4634,7 +4649,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) percpu_ref_get(&css->refcnt); atomic_inc(&cgrp->css_kill_cnt); - percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } cgroup_css_killed(cgrp); -- cgit From f20104de55a212a9742d8df1807f1f29dc95b748 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: replace cgroup->css_kill_cnt with ->nr_css Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. cgroup->css_kill_cnt is used during cgroup destruction to wait for css reference count disable; however, this model doesn't work once css's lifetimes are managed separately from cgroup's. This patch replaces it with cgroup->nr_css which is a cgroup_mutex protected integer counting the number of attached css's. The count is incremented from online_css() and decremented after refcnt kill is confirmed. If the count reaches zero and the cgroup is marked dead, the second stage of cgroup destruction is kicked off. If a cgroup doesn't have any css attached at the time of rmdir, cgroup_destroy_locked() now invokes the second stage directly as no css kill confirmation would happen. cgroup_offline_fn() - the second step of cgroup destruction - is renamed to cgroup_destroy_css_killed() and now expects to be called with cgroup_mutex held. While this patch changes how css destruction is punted to work items, it shouldn't change any visible behavior.
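The counter logic reduces to a few lines. A minimal userspace sketch of the gating follows; the mutex and percpu-ref machinery are elided, the names mirror the patch, and the code is illustrative rather than the kernel implementation.

	#include <stdio.h>
	#include <stdbool.h>

	struct cgroup {
		int nr_css;	/* number of attached css's, mutex-protected */
		bool dead;	/* set by rmdir */
	};

	static void destroy_css_killed(struct cgroup *cgrp)
	{
		printf("second stage of destruction\n");
	}

	/* called once per css when its refcnt is confirmed killed */
	static void css_killed(struct cgroup *cgrp)
	{
		if (!--cgrp->nr_css && cgrp->dead)
			destroy_css_killed(cgrp);
	}

	static void destroy_locked(struct cgroup *cgrp)
	{
		cgrp->dead = true;
		/* a cgroup with no css's skips the confirmation step entirely */
		if (!cgrp->nr_css)
			destroy_css_killed(cgrp);
	}

	int main(void)
	{
		struct cgroup cgrp = { .nr_css = 2 };

		destroy_locked(&cgrp);	/* nothing yet: two css's still attached */
		css_killed(&cgrp);	/* nr_css 2 -> 1 */
		css_killed(&cgrp);	/* nr_css 1 -> 0, kicks the second stage */
		return 0;
	}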
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 398ffbbee32f..174f4c3d72ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,7 +218,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; -static void cgroup_offline_fn(struct work_struct *work); +static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -4335,6 +4335,7 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; + css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); } return ret; @@ -4545,16 +4546,6 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static void cgroup_css_killed(struct cgroup *cgrp) -{ - if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) - return; - - /* percpu ref's of all css's are killed, kick off the next step */ - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); -} - /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget() is now guaranteed to fail. @@ -4565,7 +4556,17 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - cgroup_css_killed(cgrp); + mutex_lock(&cgroup_mutex); + + /* + * If @cgrp is marked dead, it's waiting for refs of all css's to + * be disabled before proceeding to the second phase of cgroup + * destruction. If we are the last one, kick it off. + */ + if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + cgroup_destroy_css_killed(cgrp); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -4634,11 +4635,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. The * notification callback keeps track of the number of css's to be - * killed and schedules cgroup_offline_fn() to perform the rest of - * destruction once the percpu refs of all css's are confirmed to - * be killed. + * killed and invokes cgroup_destroy_css_killed() to perform the + * rest of destruction once the percpu refs of all css's are + * confirmed to be killed. */ - atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4648,10 +4648,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ percpu_ref_get(&css->refcnt); - atomic_inc(&cgrp->css_kill_cnt); percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } - cgroup_css_killed(cgrp); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4668,6 +4666,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); + /* + * If @cgrp has css's attached, the second stage of cgroup + * destruction is kicked off from css_killed_work_fn() after the + * refs of all attached css's are killed. If @cgrp doesn't have + * any css, we kick it off here. 
+ */ + if (!cgrp->nr_css) + cgroup_destroy_css_killed(cgrp); + /* * Clear and remove @cgrp directory. The removal puts the base ref * but we aren't quite done with @cgrp yet, so hold onto it. @@ -4693,7 +4700,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) }; /** - * cgroup_offline_fn - the second step of cgroup destruction + * cgroup_destroy_css_killed - the second step of cgroup destruction * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being @@ -4702,14 +4709,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * is the second step of destruction described in the comment above * cgroup_destroy_locked(). */ -static void cgroup_offline_fn(struct work_struct *work) +static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); + lockdep_assert_held(&cgroup_mutex); /* * css_tryget() is guaranteed to fail now. Tell subsystems to @@ -4743,8 +4749,6 @@ static void cgroup_offline_fn(struct work_struct *work) set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - - mutex_unlock(&cgroup_mutex); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -- cgit From 09a503ea3a816b285b0b402b7f785eaec0c7a7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: decouple cgroup_subsys_state destruction from cgroup destruction Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. css's are created when the associated cgroup is created and destroyed when it gets destroyed. Also, individual css's aren't RCU protected but the whole cgroup is. With the planned unified hierarchy, css's will need to be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, this patch decouples css destruction from cgroup destruction - offline_css() invocation and the final css_put() are moved from cgroup_destroy_css_killed() to css_killed_work_fn(). Now each css is individually offlined and put as its reference count is killed instead of waiting for all css's attached to the cgroup to finish refcnt killing and then proceeding to offlining and putting them together. While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 ++++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 174f4c3d72ef..3c4c4b01ffe5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4355,6 +4355,7 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; + css->cgroup->nr_css--; } /* @@ -4558,15 +4559,30 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ + offline_css(css); + /* * If @cgrp is marked dead, it's waiting for refs of all css's to * be disabled before proceeding to the second phase of cgroup * destruction. If we are the last one, kick it off. 
*/ - if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + if (!cgrp->nr_css && cgroup_is_dead(cgrp)) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + + /* + * Put the css refs from kill_css(). Each css holds an extra + * reference to the cgroup's dentry and cgroup removal proceeds + * regardless of css refs. On the last put of each css, whenever + * that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. + */ + css_put(css); } /* css kill confirmation processing requires process context, bounce */ @@ -4633,11 +4649,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. The - * notification callback keeps track of the number of css's to be - * killed and invokes cgroup_destroy_css_killed() to perform the - * rest of destruction once the percpu refs of all css's are - * confirmed to be killed. + * css is confirmed to be seen as killed on all CPUs. + * cgroup_destroy_css_killed() will be invoked to perform the rest + * of destruction once the percpu refs of all css's are confirmed + * to be killed. */ for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4704,36 +4719,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being - * destroyed after the percpu refcnts of all css's are guaranteed to be - * seen as killed on all CPUs, and performs the rest of destruction. This - * is the second step of destruction described in the comment above - * cgroup_destroy_locked(). + * destroyed after all css's are offlined and performs the rest of + * destruction. This is the second step of destruction described in the + * comment above cgroup_destroy_locked(). */ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; - struct cgroup_subsys *ss; lockdep_assert_held(&cgroup_mutex); - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ - for_each_root_subsys(cgrp->root, ss) - offline_css(cgroup_css(cgrp, ss->subsys_id)); - - /* - * Put the css refs from cgroup_destroy_locked(). Each css holds - * an extra reference to the cgroup's dentry and cgroup removal - * proceeds regardless of css refs. On the last put of each css, - * whenever that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ - for_each_root_subsys(cgrp->root, ss) - css_put(cgroup_css(cgrp, ss->subsys_id)); - /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); -- cgit From edae0c3358947f8be5ca99f762d89e0c38e1f5d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: factor out kill_css() Factor out css ref killing from cgroup_destroy_locked() into kill_css(). We're gonna add more to the path and the factored out function will eventually be called from other places too. While at it, replace open coded percpu_ref_get() with css_get() for consistency. This shouldn't cause any functional difference as the function is not used for root cgroups. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 58 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3c4c4b01ffe5..7b7575f3119c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4595,6 +4595,36 @@ static void css_killed_ref_fn(struct percpu_ref *ref) schedule_work(&css->destroy_work); } +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by putting its base + * reference. ->css_offline() will be invoked asynchronously once + * css_tryget() is guaranteed to fail and when the reference count reaches + * zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + /* + * Killing would put the base ref, but we need to keep it alive + * until after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is + * invoked, no new css reference will be given out via + * css_tryget(). We can't simply call percpu_ref_kill() and + * proceed to offlining css's because percpu_ref_kill() doesn't + * guarantee that the ref is seen as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4641,30 +4671,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by killing css refcnts. cgroup core - * guarantees that, by the time ->css_offline() is invoked, no new - * css reference will be given out via css_tryget(). We can't - * simply call percpu_ref_kill() and proceed to offlining css's - * because percpu_ref_kill() doesn't guarantee that the ref is seen - * as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. - * cgroup_destroy_css_killed() will be invoked to perform the rest - * of destruction once the percpu refs of all css's are confirmed - * to be killed. + * Initiate massacre of all css's. cgroup_destroy_css_killed() + * will be invoked to perform the rest of destruction once the + * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - /* - * Killing would put the base ref, but we need to keep it - * alive until after ->css_offline. - */ - percpu_ref_get(&css->refcnt); - - percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); - } + for_each_root_subsys(cgrp->root, ss) + kill_css(cgroup_css(cgrp, ss->subsys_id)); /* * Mark @cgrp dead. This prevents further task migration and child -- cgit From 3c14f8b44fafaa60519440bea1591e495b928327 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: move subsys file removal to kill_css() With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. This patch moves subsys file removal from cgroup_destroy_locked() to kill_css(). 
While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b7575f3119c..3137e38995b0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4599,13 +4599,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * kill_css - destroy a css * @css: css to destroy * - * This function initiates destruction of @css by putting its base - * reference. ->css_offline() will be invoked asynchronously once - * css_tryget() is guaranteed to fail and when the reference count reaches - * zero, @css will be released. + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). @@ -4703,10 +4705,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup_destroy_css_killed(cgrp); /* - * Clear and remove @cgrp directory. The removal puts the base ref - * but we aren't quite done with @cgrp yet, so hold onto it. + * Clear the base files and remove @cgrp directory. The removal + * puts the base ref but we aren't quite done with @cgrp yet, so + * hold onto it. */ - cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); -- cgit From 0c21ead136a900c36f1ab74fd7d09a306dc31324 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: RCU protect each cgroup_subsys_state release With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. Most of the destruction path has been decoupled but the actual free of css still depends on cgroup free path. When all css refs are drained, css_release() kicks off css_free_work_fn() which puts the cgroup. When the cgroup refcnt reaches zero, cgroup_diput() is invoked which in turn schedules RCU free of the cgroup. After a grace period, all css's are freed along with the cgroup itself. This patch moves the RCU grace period and css freeing from cgroup release path to css release path. css_release(), instead of kicking off css_free_work_fn() directly, schedules RCU callback css_free_rcu_fn() which in turn kicks off css_free_work_fn() after a RCU grace period. css_free_work_fn() is updated to free the css directly. The five-way punting - percpu ref kill confirmation, a work item, percpu ref release, RCU grace period, and again a work item - is quite hairy but the work items are there only to provide process context and the actual sequence is kill confirm -> release -> RCU free, which isn't simple but not too crazy. This removes cgroup_css() usage after offline_css() allowing clearing cgroup->subsys[] from offline_css(), which makes it consistent with online_css() and brings it closer to proper lifetime management for individual css's. 
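Condensed, the destruction hand-offs after this patch chain together as below (a sketch of the call sequence assembled from the hunks that follow; locking, error paths and the parent css_put() are elided):

/* 1. kill_css(): initiate the percpu_ref kill */
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);

/* 2. kill confirmed on all CPUs: a work item offlines the css and
 * puts the base ref
 */

/* 3. css_release(): last ref gone, wait out an RCU grace period */
call_rcu(&css->rcu_head, css_free_rcu_fn);

/* 4. css_free_rcu_fn(): grace period over, bounce to process context */
INIT_WORK(&css->destroy_work, css_free_work_fn);
schedule_work(&css->destroy_work);

/* 5. css_free_work_fn(): free the css, then put the cgroup */
css->ss->css_free(css);
cgroup_dput(cgrp);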
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 53 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3137e38995b0..66d01078eebe 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -869,18 +869,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - ss->css_free(css); - } - cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -4281,32 +4271,62 @@ err: return ret; } +/* + * css destruction is four-stage process. + * + * 1. Destruction starts. Killing of the percpu_ref is initiated. + * Implemented in kill_css(). + * + * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs + * and thus css_tryget() is guaranteed to fail, the css can be offlined + * by invoking offline_css(). After offlining, the base ref is put. + * Implemented in css_killed_work_fn(). + * + * 3. When the percpu_ref reaches zero, the only possible remaining + * accessors are inside RCU read sections. css_release() schedules the + * RCU callback. + * + * 4. After the grace period, the css can be freed. Implemented in + * css_free_work_fn(). + * + * It is actually hairier because both step 2 and 4 require process context + * and thus involve punting to css->destroy_work adding two additional + * steps to the already complex sequence. + */ static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; if (css->parent) css_put(css->parent); - cgroup_dput(css->cgroup); + css->ss->css_free(css); + cgroup_dput(cgrp); } -static void css_release(struct percpu_ref *ref) +static void css_free_rcu_fn(struct rcu_head *rcu_head) { struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); + container_of(rcu_head, struct cgroup_subsys_state, rcu_head); /* * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->destroy_work will be used to invoke - * dput() asynchronously from css_put(). + * css_put(). dput() requires process context which we don't have. 
*/ INIT_WORK(&css->destroy_work, css_free_work_fn); schedule_work(&css->destroy_work); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { @@ -4356,6 +4376,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; + RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } /* -- cgit From ff58ac0d58d51bffe868b239ed8fce7c4a23c5a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 13 Aug 2013 09:17:33 +0800 Subject: cpuset: remove an unnecessary forward declaration Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 72a0383f382f..95f4b25e1538 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -68,9 +68,6 @@ */ int number_of_cpusets __read_mostly; -/* Forward declare cgroup structures */ -struct cgroup_subsys cpuset_subsys; - /* See "Frequency meter" comments, below. */ struct fmeter { -- cgit From 930913a31289202d232186b82854b26d7fb7cf4d Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 16 Aug 2013 17:57:14 +0800 Subject: cgroup: use css_get() in cgroup_create() to check CSS_ROOT It seems that the root css doesn't have a refcnt allocated (not needed?), which would cause the boot error attached. This patch switches to css_get(), which does not increase the refcnt when the parent is the root. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cgroup_mkdir+0x37c/0x740 PGD 0 Oops: 0002 [#1] Modules linked in: CPU: 0 PID: 1 Comm: systemd Not tainted 3.11.0-rc5-next-20130815+ #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 task: ffff88007f868000 ti: ffff88007f864000 task.ti: ffff88007f864000 RIP: 0010:[] [] cgroup_mkdir+0x37c/0x740 RSP: 0018:ffff88007f865df8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffffff81a46ee0 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81a415c0 RBP: ffff88007f865ec8 R08: 0000000000000001 R09: 0000000000000000 R10: ffff88007ce6d060 R11: 0000000000000000 R12: ffff88007ce6d000 R13: ffff88007ce6d060 R14: ffffffff81a46d80 R15: ffff88007c6e8018 FS: 00007f13dbf6f840(0000) GS:ffffffff81a23000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007b7e5000 CR4: 00000000000006b0 Stack: ffffffff810b380d 0000000000000002 ffff88007f865e18 ffffffff81167069 ffff88007f865ed8 ffffffff8116a3f5 ffff880037454400 ffff88007c6e8018 ffff88007c6e8028 ffff88007c6e8328 ffff88007c6e8000 ffff88007ce6d000 Call Trace: [] ? cgroup_mkdir+0x3bd/0x740 [] ? lookup_hash+0x19/0x20 [] ?
kern_path_create+0x95/0x170 [] vfs_mkdir+0x9e/0xf0 [] SyS_mkdirat+0x60/0xe0 [] SyS_mkdir+0x19/0x20 [] tracesys+0xcf/0xd4 Code: ad 70 ff ff ff 48 89 9d 60 ff ff ff 4d 89 d5 4c 8b bd 68 ff ff ff 4c 8b 65 88 eb 50 0f 1f 00 48 8b 43 18 a8 03 0f 85 6c 03 00 00 00 e8 1d 0a fb ff 85 c0 74 0d 80 3d f0 45 a1 00 00 0f 84 4c RIP [] cgroup_mkdir+0x37c/0x740 RSP CR2: 0000000000000000 ---[ end trace a4b14b49bc46fd60 ]--- Signed-off-by: Li Zhong Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66d01078eebe..b69b572131e5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4494,7 +4494,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); - percpu_ref_get(&css->parent->refcnt); + css_get(css->parent); } /* hold a ref to the parent's dentry */ -- cgit From 1cb650b91ba582f6737457b7d22e368585596d2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 19 Aug 2013 10:05:24 +0800 Subject: cgroup: change cgroup_from_id() to css_from_id() Now we want cgroup core to always provide the css to use to the subsystems, so change this API to css_from_id(). Uninline css_from_id(), because it's getting bigger and cgroup_css() has been unexported. While at it, remove the #ifdef, and shuffle the order of the args. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b69b572131e5..ff7d642a070a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5717,6 +5717,28 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) return css ? css : ERR_PTR(-ENOENT); } +/** + * css_from_id - lookup css by id + * @id: the cgroup id + * @ss: cgroup subsys to be looked into + * + * Returns the css if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). + */ +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) +{ + struct cgroup *cgrp; + + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&cgroup_mutex), + "css_from_id() needs proper protection"); + + cgrp = idr_find(&ss->root->cgroup_idr, id); + if (cgrp) + return cgroup_css(cgrp, ss->subsys_id); + return NULL; +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit From 0bfb4aa67cef4982adc70590a31624d7b35a0bda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:42:36 -0400 Subject: cgroup: fix subsystem file accesses on the root cgroup 105347ba5 ("cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css") added cfent->css to cache the associated cgroup_subsys_state across file operations. A cfent is associated with a single css throughout its lifetime and the original commit initialized the cache pointer during cgroup_add_file() and verified that it matches the actual one in cgroup_file_open(). While this works fine for !root cgroups, it's broken for root cgroups as files in a root cgroup are created before the css's are associated with the cgroup and thus the cgroup_css() call in cgroup_add_file() returns NULL, associating all cfents in the root cgroup with the NULL css. This makes cgroup_file_open() trigger WARN and fail with -ENODEV for all !core subsystem files in the root cgroups.
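Schematically, the broken ordering for a root cgroup is the following (a simplified timeline, not literal code):

/* root cgroup setup, simplified */
cgroup_add_file(cgrp, cft);
        /* cfe->css = cgroup_css(...) is NULL at this point */

/* ... css's become associated with the root cgroup only later ... */

cgroup_file_open(inode, file);
        /* cgroup_css() now returns the real css, which != cfe->css
         * (NULL), so WARN_ON_ONCE() fires and the open fails with
         * -ENODEV
         */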
There's no reason to initialize cfent->css separately from cgroup_add_file(). As the association never changes, cgroup_file_open() can set it unconditionally every time, and keeping the logic contained in cgroup_file_open() makes more sense anyway as the only reason it's necessary is file->private_data being already occupied. Fix it by setting cfent->css unconditionally from cgroup_file_open(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ff7d642a070a..896e035eb6e4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2490,10 +2490,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) } rcu_read_unlock(); - /* css should match @cfe->css, see cgroup_add_file() for details */ - if (!css || WARN_ON_ONCE(css != cfe->css)) + if (!css) return -ENODEV; + /* + * @cfe->css is used by read/write/close to determine the + * associated css. @file->private_data would be a better place but + * that's already used by seqfile. Multiple accessors may use it + * simultaneously which is okay as the association never changes. + */ + WARN_ON_ONCE(cfe->css && cfe->css != css); + cfe->css = css; + if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; err = single_open(file, cgroup_seqfile_show, cfe); @@ -2772,18 +2780,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); - /* - * cfe->css is used by read/write/close to determine the associated - * css. file->private_data would be a better place but that's - * already used by seqfile. Note that open will use the usual - * cgroup_css() and css_tryget() to acquire the css and this - * caching doesn't affect css lifetime management. - */ - if (cft->ss) - cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); - else - cfe->css = &cgrp->dummy_css; - mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit From 6e6eab0efdf48fb2d8d7aee904d7740acb4661c6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:43:15 -0400 Subject: cgroup: fix cgroup_write_event_control() 81eeaf0411 ("cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state instead of cgroup") updated the cftype event methods to take @css (cgroup_subsys_state) instead of @cgroup; however, it incorrectly used the @css passed to cgroup_write_event_control(), which is the dummy_css for the cgroup as the file is a cgroup core file. This leads to an oops on event registration. Fix it by using the css matching the event target file. Note that cgroup_write_event_control() now disallows cgroup core files from being event sources. This is for simplicity and doesn't matter as cgroup_event will be moved and made specific to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 896e035eb6e4..ef43e3f453ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4040,10 +4040,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation.
*/ -static int cgroup_write_event_control(struct cgroup_subsys_state *css, +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4065,7 +4065,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->css = css; + INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4101,6 +4101,23 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; + } + + /* determine the css of @cfile and associate @event with it */ + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + if (event->css) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + /* * The file to be monitored must be in the same cgroup as * cgroup.event_control is. @@ -4116,7 +4133,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(css, event->cft, + ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; -- cgit From 35cf083619da5677f83e9a8eae813f0b413d7082 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: rename cgroup_css_from_dir() to css_from_dir() and update its syntax cgroup_css_from_dir() will grow another user. In preparation, make the following changes. * All css functions are prefixed with just "css_"; rename it to css_from_dir(). * Take dentry * instead of file * as dentry is what ultimately identifies a cgroup and file may not always be available. Note that the function now checks whether @dentry->d_inode is NULL as the caller now may specify a negative dentry. * Make it take cgroup_subsys * instead of integer subsys_id. This simplifies the function and allows specifying no subsystem for cgroup->dummy_css. * Make the return section a bit less verbose. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar --- kernel/cgroup.c | 26 ++++++++++---------------- kernel/events/core.c | 2 +- 2 files changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef43e3f453ef..921b1387c944 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5700,34 +5700,28 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) EXPORT_SYMBOL_GPL(css_lookup); /** - * cgroup_css_from_dir - get corresponding css from file open on cgroup dir - * @f: directory file of interest - * @id: subsystem id of interest + * css_from_dir - get corresponding css from the dentry of a cgroup dir + * @dentry: directory dentry of interest + * @ss: subsystem of interest * * Must be called under RCU read lock. The caller is responsible for * pinning the returned css if it needs to be accessed outside the RCU * critical section.
*/ -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - struct inode *inode; - struct cgroup_subsys_state *css; WARN_ON_ONCE(!rcu_read_lock_held()); - inode = file_inode(f); - /* check in cgroup filesystem dir */ - if (inode->i_op != &cgroup_dir_inode_operations) + /* is @dentry a cgroup dir? */ + if (!dentry->d_inode || + dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); - if (id < 0 || id >= CGROUP_SUBSYS_COUNT) - return ERR_PTR(-EINVAL); - - /* get cgroup */ - cgrp = __d_cgrp(f->f_dentry); - css = cgroup_css(cgrp, id); - return css ? css : ERR_PTR(-ENOENT); + cgrp = __d_cgrp(dentry); + return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 23261f957713..b59ab6632f30 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -593,7 +593,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, rcu_read_lock(); - css = cgroup_css_from_dir(f.file, perf_subsys_id); + css = css_from_dir(f.file->f_dentry, &perf_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; -- cgit From ca8bdcaff0d77990fb69e0f946018c96a70851cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys cgroup_css() is no longer used in hot paths. Make it take struct cgroup_subsys * and allow the users to specify NULL subsys to obtain the dummy_css. This removes open-coded NULL subsystem testing in a couple users and generally simplifies the code. After this patch, css_from_dir() also allows NULL @ss and returns the matching dummy_css. This behavior change doesn't affect its only user - perf. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- kernel/cgroup.c | 90 +++++++++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 921b1387c944..7516668d8325 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,19 +226,22 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @subsys_id: the subsystem of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) * - * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. - * This function must be called either under cgroup_mutex or - * rcu_read_lock() and the caller is responsible for pinning the returned - * css if it wants to keep accessing it outside the said locks. This - * function may return %NULL if @cgrp doesn't have @subsys_id enabled. + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. 
*/ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - int subsys_id) + struct cgroup_subsys *ss) { - return rcu_dereference_check(cgrp->subsys[subsys_id], - lockdep_is_held(&cgroup_mutex)); + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->dummy_css; } /* convenient tests for these bits */ @@ -580,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgroup_css(cgrp, i); + template[i] = cgroup_css(cgrp, ss); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -1062,30 +1065,30 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, i)); - BUG_ON(!cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, ss)); + BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, i)); - cgroup_css(cgrp, i)->cgroup = cgrp; + cgroup_css(cgroup_dummy_top, ss)); + cgroup_css(cgrp, ss)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgroup_css(cgrp, i)); + ss->bind(cgroup_css(cgrp, ss)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, i) != cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, i)); + ss->bind(cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; + cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -1930,7 +1933,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_next); struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, int subsys_id) { - return cgroup_css(tset->cur_cgrp, subsys_id); + return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); } EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); @@ -2071,7 +2074,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2113,7 +2116,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. 
*/ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->attach) ss->attach(css, &tset); @@ -2135,7 +2138,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss == failed_ss) break; @@ -2481,13 +2484,9 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * @css stays alive for all file operations. */ rcu_read_lock(); - if (cft->ss) { - css = cgroup_css(cgrp, cft->ss->subsys_id); - if (!css_tryget(css)) - css = NULL; - } else { - css = &cgrp->dummy_css; - } + css = cgroup_css(cgrp, cft->ss); + if (cft->ss && !css_tryget(css)) + css = NULL; rcu_read_unlock(); if (!css) @@ -2878,7 +2877,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (cgroup_is_dead(cgrp)) @@ -3082,10 +3081,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, if (&next->sibling == &cgrp->children) return NULL; - if (parent_css->ss) - return cgroup_css(next, parent_css->ss->subsys_id); - else - return &next->dummy_css; + return cgroup_css(next, parent_css->ss); } EXPORT_SYMBOL_GPL(css_next_child); @@ -4110,7 +4106,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, rcu_read_lock(); ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + event->css = cgroup_css(cgrp, event->cft->ss); if (event->css) ret = 0; @@ -4266,7 +4262,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4349,11 +4345,11 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->id = NULL; if (cgrp->parent) - css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + css->parent = cgroup_css(cgrp->parent, ss); else css->flags |= CSS_ROOT; - BUG_ON(cgroup_css(cgrp, ss->subsys_id)); + BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4466,7 +4462,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(parent, ss)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4712,7 +4708,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * percpu refs of all css's are confirmed to be killed. */ for_each_root_subsys(cgrp->root, ss) - kill_css(cgroup_css(cgrp, ss->subsys_id)); + kill_css(cgroup_css(cgrp, ss)); /* * Mark @cgrp dead. 
This prevents further task migration and child @@ -4839,7 +4835,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_css(css, ss, cgroup_dummy_top); @@ -4918,7 +4914,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -5000,7 +4996,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + offline_css(cgroup_css(cgroup_dummy_top, ss)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5034,7 +5030,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); @@ -5721,7 +5717,7 @@ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, return ERR_PTR(-EBADF); cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); + return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); } /** -- cgit From 9fa4db334c7d9570aec7a5121e84fae99aae1d04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: implement CFTYPE_NO_PREFIX When cgroup files are created, cgroup core automatically prepends the name of the subsystem as prefix. This patch adds CFTYPE_NO_PREFIX, which disables the automatic prefix. This is to work around historical baggage and shouldn't be used for new files. This will be used to move "cgroup.event_control" from cgroup core to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Cc: Glauber Costa --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7516668d8325..a41dc87cd07e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2756,7 +2756,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, cft->ss->name); strcat(name, "."); } -- cgit From 7941cb027dccedec3c047271554ddcf4be2e0697 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_event hold onto cgroup_subsys_state instead of cgroup Currently, each registered cgroup_event holds an extra reference to the cgroup. This is a bit weird as events are subsystem specific and will also be incorrect in the planned unified hierarchy as css (cgroup_subsys_state) may come and go dynamically across the lifetime of a cgroup.
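The underlying issue is object lifetime: pinning a containing object does not pin its members. The usual way to pin a css itself across an asynchronous window is the tryget-under-RCU pattern, roughly (a minimal sketch; the hunks below apply exactly this to cgroup_event):

rcu_read_lock();
css = cgroup_css(cgrp, ss);
if (css && !css_tryget(css))    /* may race with css destruction */
        css = NULL;
rcu_read_unlock();

if (!css)
        return -EINVAL;
/* ... css is now pinned; drop with css_put() when done ... */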
Holding onto cgroup won't prevent the target css from going away. Update cgroup_event to hold onto the css the target file belongs to instead of cgroup. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- kernel/cgroup.c | 26 ++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a41dc87cd07e..12237a291d88 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3969,7 +3969,6 @@ static void cgroup_event_remove(struct work_struct *work) struct cgroup_event *event = container_of(work, struct cgroup_event, remove); struct cgroup_subsys_state *css = event->css; - struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); @@ -3980,7 +3979,7 @@ static void cgroup_event_remove(struct work_struct *work) eventfd_ctx_put(event->eventfd); kfree(event); - cgroup_dput(cgrp); + css_put(css); } /* @@ -4103,12 +4102,16 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } - /* determine the css of @cfile and associate @event with it */ + /* + * Determine the css of @cfile and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. + */ rcu_read_lock(); ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css) + if (event->css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); @@ -4122,28 +4125,21 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_css; } ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) - goto out_put_cfile; + goto out_put_css; efile->f_op->poll(efile, &event->pt); - /* - * Events should be removed after rmdir of cgroup directory, but before - * destroying subsystem state objects. Let's take reference to cgroup - * directory dentry to do that. - */ - dget(cgrp->dentry); - spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); @@ -4153,6 +4149,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, return 0; +out_put_css: + css_put(event->css); out_put_cfile: fput(cfile); out_put_eventfd: -- cgit From 7c918cbbd829669bf70ffcc45962d5d992942243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_write_event_control() use css_from_dir() instead of __d_cgrp() cgroup_event will be moved to its only user - memcg. Replace __d_cgrp() usage with css_from_dir(), which is already exported. This also simplifies the code a bit. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A.
Shutemov --- kernel/cgroup.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 12237a291d88..e76698dd6c08 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4041,7 +4041,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, { struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; - struct cgroup *cgrp_cfile; + struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct file *efile; struct file *cfile; @@ -4103,7 +4103,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, } /* - * Determine the css of @cfile and associate @event with it. + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. * Remaining events are automatically removed on cgroup destruction * but the removal is asynchronous, so take an extra ref. */ @@ -4111,23 +4112,14 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css && css_tryget(event->css)) + cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); if (ret) goto out_put_cfile; - /* - * The file to be monitored must be in the same cgroup as - * cgroup.event_control is. - */ - cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); - if (cgrp_cfile != cgrp) { - ret = -EINVAL; - goto out_put_css; - } - if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; goto out_put_css; -- cgit From d1625964da51bda61306ad3ec45307a799c21f08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Aug 2013 14:27:23 -0400 Subject: cgroup: fix cgroup_css() invocation in css_from_id() ca8bdcaff0 ("cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys") missed one conversion in css_from_id(), which was newly added. As css_from_id() doesn't have any user yet, this doesn't break anything other than generating a build warning. Convert it. Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell Reported-by: kbuild test robot --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e76698dd6c08..b5f4989937f2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5729,7 +5729,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) - return cgroup_css(cgrp, ss->subsys_id); + return cgroup_css(cgrp, ss); return NULL; } -- cgit
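For completeness, a typical lookup through the fixed helper would go along these lines (an illustrative sketch; the caller, the subsystem pointer and the error handling are assumptions, not code from the series):

struct cgroup_subsys_state *css;

rcu_read_lock();
css = css_from_id(id, ss);      /* NULL if @id has no cgroup in @ss's root */
if (css && !css_tryget(css))    /* pin it before leaving the RCU section */
        css = NULL;
rcu_read_unlock();

if (css) {
        /* safe to dereference @css here */
        css_put(css);
}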