summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/kernfs/mount.c30
-rw-r--r--include/linux/kernfs.h1
-rw-r--r--kernel/cgroup.c58
-rw-r--r--kernel/cpuset.c20
-rw-r--r--mm/mempolicy.c2
5 files changed, 100 insertions, 11 deletions
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d171b98a6cdd..f973ae9b05f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb)
kernfs_put(root_kn);
}
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. This can be used to block ->kill_sb() which may be useful
+ * for kernfs users which dynamically manage superblocks.
+ *
+ * Returns NULL if there's no superblock associated to this kernfs_root, or
+ * -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+ struct kernfs_super_info *info;
+ struct super_block *sb = NULL;
+
+ mutex_lock(&kernfs_mutex);
+ list_for_each_entry(info, &root->supers, node) {
+ if (info->ns == ns) {
+ sb = info->sb;
+ if (!atomic_inc_not_zero(&info->sb->s_active))
+ sb = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mutex_unlock(&kernfs_mutex);
+ return sb;
+}
+
void __init kernfs_init(void)
{
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 145375ea0bd9..30faf797c2c3 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -305,6 +305,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
struct kernfs_root *root, unsigned long magic,
bool *new_sb_created, const void *ns);
void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
void kernfs_init(void);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+ int i;
bool new_sb;
/*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
+
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
+
for_each_root(root) {
bool name_match = false;
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
}
/*
- * A root's lifetime is governed by its root cgroup.
- * tryget_live failure indicate that the root is being
- * destroyed. Wait for destruction to complete so that the
- * subsystems are free. We can use wait_queue for the wait
- * but this path is super cold. Let's just sleep for a bit
- * and retry.
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
*/
- if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
msleep(10);
ret = restart_syscall();
goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
CGROUP_SUPER_MAGIC, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
+ }
+
return dentry;
}
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
rcu_read_lock();
css_for_each_child(child, css) {
- if (css->flags & CSS_ONLINE) {
+ if (child->flags & CSS_ONLINE) {
ret = true;
break;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
int current_cpuset_is_being_rebound(void)
{
- return task_cs(current) == cpuset_being_rebound;
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* resources, wait for the previously scheduled operations before
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
*/
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
return retval ?: nbytes;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eb58de19f815..8f5330d74f47 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2139,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
} else
*new = *old;
- rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
if (new->flags & MPOL_F_REBINDING)
@@ -2147,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
else
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
- rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}