summaryrefslogtreecommitdiff
path: root/fs/kernfs/dir.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/kernfs/dir.c')
-rw-r--r--fs/kernfs/dir.c979
1 files changed, 647 insertions, 332 deletions
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index db5900aaa55a..5c0efd6b239f 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/kernfs/dir.c - kernfs directory implementation
*
* Copyright (c) 2001-3 Patrick Mochel
* Copyright (c) 2007 SUSE Linux Products GmbH
* Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2.
*/
#include <linux/sched.h>
@@ -18,18 +17,29 @@
#include "kernfs-internal.h"
-DEFINE_MUTEX(kernfs_mutex);
-static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
-static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
+/*
+ * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
+ * call pr_cont() while holding rename_lock, because pr_cont() sometimes
+ * performs wakeups when releasing console_sem. Holding rename_lock
+ * would then deadlock if the scheduler reads the kernfs_name in the
+ * wakeup path.
+ */
+static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
+static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
-static bool kernfs_active(struct kernfs_node *kn)
+static bool __kernfs_active(struct kernfs_node *kn)
{
- lockdep_assert_held(&kernfs_mutex);
return atomic_read(&kn->active) >= 0;
}
+static bool kernfs_active(struct kernfs_node *kn)
+{
+ lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
+ return __kernfs_active(kn);
+}
+
static bool kernfs_lockdep(struct kernfs_node *kn)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -39,22 +49,14 @@ static bool kernfs_lockdep(struct kernfs_node *kn)
#endif
}
-static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
-{
- if (!kn)
- return strlcpy(buf, "(null)", buflen);
-
- return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
-}
-
/* kernfs_node_depth - compute depth from @from to @to */
static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
size_t depth = 0;
- while (to->parent && to != from) {
+ while (rcu_dereference(to->__parent) && to != from) {
depth++;
- to = to->parent;
+ to = rcu_dereference(to->__parent);
}
return depth;
}
@@ -72,18 +74,18 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
db = kernfs_depth(rb->kn, b);
while (da > db) {
- a = a->parent;
+ a = rcu_dereference(a->__parent);
da--;
}
while (db > da) {
- b = b->parent;
+ b = rcu_dereference(b->__parent);
db--;
}
/* worst case b and a will be the same at root */
while (b != a) {
- b = b->parent;
- a = a->parent;
+ b = rcu_dereference(b->__parent);
+ a = rcu_dereference(a->__parent);
}
return a;
@@ -113,9 +115,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
* kn_to: /n1/n2/n3 [depth=3]
* result: /../..
*
- * [3] when @kn_to is NULL result will be "(null)"
+ * [3] when @kn_to is %NULL result will be "(null)"
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -126,16 +128,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
struct kernfs_node *kn, *common;
const char parent_str[] = "/..";
size_t depth_from, depth_to, len = 0;
+ ssize_t copied;
int i, j;
if (!kn_to)
- return strlcpy(buf, "(null)", buflen);
+ return strscpy(buf, "(null)", buflen);
if (!kn_from)
kn_from = kernfs_root(kn_to)->kn;
if (kn_from == kn_to)
- return strlcpy(buf, "/", buflen);
+ return strscpy(buf, "/", buflen);
common = kernfs_common_ancestor(kn_from, kn_to);
if (WARN_ON(!common))
@@ -144,21 +147,24 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
depth_to = kernfs_depth(common, kn_to);
depth_from = kernfs_depth(common, kn_from);
- if (buf)
- buf[0] = '\0';
+ buf[0] = '\0';
- for (i = 0; i < depth_from; i++)
- len += strlcpy(buf + len, parent_str,
- len < buflen ? buflen - len : 0);
+ for (i = 0; i < depth_from; i++) {
+ copied = strscpy(buf + len, parent_str, buflen - len);
+ if (copied < 0)
+ return copied;
+ len += copied;
+ }
/* Calculate how many bytes we need for the rest */
for (i = depth_to - 1; i >= 0; i--) {
+ const char *name;
+
for (kn = kn_to, j = 0; j < i; j++)
- kn = kn->parent;
- len += strlcpy(buf + len, "/",
- len < buflen ? buflen - len : 0);
- len += strlcpy(buf + len, kn->name,
- len < buflen ? buflen - len : 0);
+ kn = rcu_dereference(kn->__parent);
+
+ name = rcu_dereference(kn->name);
+ len += scnprintf(buf + len, buflen - len, "/%s", name);
}
return len;
@@ -171,22 +177,29 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
- * similar to strlcpy(). It returns the length of @kn's name and if @buf
- * isn't long enough, it's filled upto @buflen-1 and nul terminated.
+ * similar to strscpy().
+ *
+ * Fills buffer with "(null)" if @kn is %NULL.
*
- * Fills buffer with "(null)" if @kn is NULL.
+ * Return: the resulting length of @buf. If @buf isn't long enough,
+ * it's filled up to @buflen-1 and nul terminated, and -E2BIG is returned.
*
* This function can be called from any context.
*/
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
- unsigned long flags;
- int ret;
+ struct kernfs_node *kn_parent;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
- ret = kernfs_name_locked(kn, buf, buflen);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
- return ret;
+ if (!kn)
+ return strscpy(buf, "(null)", buflen);
+
+ guard(rcu)();
+ /*
+ * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and
+ * the parent is either existing or not.
+ */
+ kn_parent = rcu_dereference(kn->__parent);
+ return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen);
}
/**
@@ -201,20 +214,24 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
char *buf, size_t buflen)
{
- unsigned long flags;
- int ret;
+ struct kernfs_root *root;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
- ret = kernfs_path_from_node_locked(to, from, buf, buflen);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
- return ret;
+ guard(rcu)();
+ if (to) {
+ root = kernfs_root(to);
+ if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) {
+ guard(read_lock_irqsave)(&root->kernfs_rename_lock);
+ return kernfs_path_from_node_locked(to, from, buf, buflen);
+ }
+ }
+ return kernfs_path_from_node_locked(to, from, buf, buflen);
}
EXPORT_SYMBOL_GPL(kernfs_path_from_node);
@@ -228,12 +245,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
{
unsigned long flags;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+ kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
pr_cont("%s", kernfs_pr_cont_buf);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
/**
@@ -247,24 +264,22 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
unsigned long flags;
int sz;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
- pr_cont("(error)");
- goto out;
- }
-
- if (sz >= sizeof(kernfs_pr_cont_buf)) {
- pr_cont("(name too long)");
+ if (sz == -E2BIG)
+ pr_cont("(name too long)");
+ else
+ pr_cont("(error)");
goto out;
}
pr_cont("%s", kernfs_pr_cont_buf);
out:
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
/**
@@ -273,26 +288,30 @@ out:
*
* Determines @kn's parent, pins and returns it. This function can be
* called from any context.
+ *
+ * Return: parent node of @kn
*/
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
struct kernfs_node *parent;
+ struct kernfs_root *root;
unsigned long flags;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
- parent = kn->parent;
+ root = kernfs_root(kn);
+ read_lock_irqsave(&root->kernfs_rename_lock, flags);
+ parent = kernfs_parent(kn);
kernfs_get(parent);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ read_unlock_irqrestore(&root->kernfs_rename_lock, flags);
return parent;
}
/**
- * kernfs_name_hash
+ * kernfs_name_hash - calculate hash of @ns + @name
* @name: Null terminated string to hash
* @ns: Namespace tag to hash
*
- * Returns 31 bit hash of ns + name (so it fits in an off_t )
+ * Return: 31-bit hash of ns + name (so it fits in an off_t)
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
@@ -321,13 +340,13 @@ static int kernfs_name_compare(unsigned int hash, const char *name,
return -1;
if (ns > kn->ns)
return 1;
- return strcmp(name, kn->name);
+ return strcmp(name, kernfs_rcu_name(kn));
}
static int kernfs_sd_compare(const struct kernfs_node *left,
const struct kernfs_node *right)
{
- return kernfs_name_compare(left->hash, left->name, left->ns, right);
+ return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right);
}
/**
@@ -338,15 +357,19 @@ static int kernfs_sd_compare(const struct kernfs_node *left,
* @kn->parent->dir.children.
*
* Locking:
- * mutex_lock(kernfs_mutex)
+ * kernfs_rwsem held exclusive
*
- * RETURNS:
- * 0 on susccess -EEXIST on failure.
+ * Return:
+ * %0 on success, -EEXIST on failure.
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
- struct rb_node **node = &kn->parent->dir.children.rb_node;
struct rb_node *parent = NULL;
+ struct kernfs_node *kn_parent;
+ struct rb_node **node;
+
+ kn_parent = kernfs_parent(kn);
+ node = &kn_parent->dir.children.rb_node;
while (*node) {
struct kernfs_node *pos;
@@ -365,11 +388,14 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
/* add new node and rebalance the tree */
rb_link_node(&kn->rb, parent, node);
- rb_insert_color(&kn->rb, &kn->parent->dir.children);
+ rb_insert_color(&kn->rb, &kn_parent->dir.children);
/* successfully added, account subdir number */
+ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
- kn->parent->dir.subdirs++;
+ kn_parent->dir.subdirs++;
+ kernfs_inc_rev(kn_parent);
+ up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
return 0;
}
@@ -379,21 +405,29 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
* @kn: kernfs_node of interest
*
* Try to unlink @kn from its sibling rbtree which starts from
- * kn->parent->dir.children. Returns %true if @kn was actually
- * removed, %false if @kn wasn't on the rbtree.
+ * kn->parent->dir.children.
+ *
+ * Return: %true if @kn was actually removed,
+ * %false if @kn wasn't on the rbtree.
*
* Locking:
- * mutex_lock(kernfs_mutex)
+ * kernfs_rwsem held exclusive
*/
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
+ struct kernfs_node *kn_parent;
+
if (RB_EMPTY_NODE(&kn->rb))
return false;
+ kn_parent = kernfs_parent(kn);
+ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
- kn->parent->dir.subdirs--;
+ kn_parent->dir.subdirs--;
+ kernfs_inc_rev(kn_parent);
+ up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
- rb_erase(&kn->rb, &kn->parent->dir.children);
+ rb_erase(&kn->rb, &kn_parent->dir.children);
RB_CLEAR_NODE(&kn->rb);
return true;
}
@@ -403,10 +437,10 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn)
* @kn: kernfs_node to get an active reference to
*
* Get an active reference of @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*
- * RETURNS:
- * Pointer to @kn on success, NULL on failure.
+ * Return:
+ * Pointer to @kn on success, %NULL on failure.
*/
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
@@ -426,42 +460,52 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
* @kn: kernfs_node to put an active reference to
*
* Put an active reference to @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*/
void kernfs_put_active(struct kernfs_node *kn)
{
- struct kernfs_root *root = kernfs_root(kn);
int v;
if (unlikely(!kn))
return;
if (kernfs_lockdep(kn))
- rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ rwsem_release(&kn->dep_map, _RET_IP_);
v = atomic_dec_return(&kn->active);
if (likely(v != KN_DEACTIVATED_BIAS))
return;
- wake_up_all(&root->deactivate_waitq);
+ wake_up_all(&kernfs_root(kn)->deactivate_waitq);
}
/**
* kernfs_drain - drain kernfs_node
* @kn: kernfs_node to drain
*
- * Drain existing usages and nuke all existing mmaps of @kn. Mutiple
+ * Drain existing usages and nuke all existing mmaps of @kn. Multiple
* removers may invoke this function concurrently on @kn and all will
* return after draining is complete.
*/
static void kernfs_drain(struct kernfs_node *kn)
- __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
+ __releases(&kernfs_root(kn)->kernfs_rwsem)
+ __acquires(&kernfs_root(kn)->kernfs_rwsem)
{
struct kernfs_root *root = kernfs_root(kn);
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held_write(&root->kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn));
- mutex_unlock(&kernfs_mutex);
+ /*
+ * Skip draining if already fully drained. This avoids draining and its
+ * lockdep annotations for nodes which have never been activated
+ * allowing embedding kernfs_remove() in create error paths without
+ * worrying about draining.
+ */
+ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS &&
+ !kernfs_should_drain_open_files(kn))
+ return;
+
+ up_write(&root->kernfs_rwsem);
if (kernfs_lockdep(kn)) {
rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
@@ -469,18 +513,18 @@ static void kernfs_drain(struct kernfs_node *kn)
lock_contended(&kn->dep_map, _RET_IP_);
}
- /* but everyone should wait for draining */
wait_event(root->deactivate_waitq,
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
if (kernfs_lockdep(kn)) {
lock_acquired(&kn->dep_map, _RET_IP_);
- rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ rwsem_release(&kn->dep_map, _RET_IP_);
}
- kernfs_drain_open_files(kn);
+ if (kernfs_should_drain_open_files(kn))
+ kernfs_drain_open_files(kn);
- mutex_lock(&kernfs_mutex);
+ down_write(&root->kernfs_rwsem);
}
/**
@@ -496,6 +540,21 @@ void kernfs_get(struct kernfs_node *kn)
}
EXPORT_SYMBOL_GPL(kernfs_get);
+static void kernfs_free_rcu(struct rcu_head *rcu)
+{
+ struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
+
+ /* If the whole node goes away, then name can't be used outside */
+ kfree_const(rcu_access_pointer(kn->name));
+
+ if (kn->iattr) {
+ simple_xattrs_free(&kn->iattr->xattrs, NULL);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
+ }
+
+ kmem_cache_free(kernfs_node_cache, kn);
+}
+
/**
* kernfs_put - put a reference count on a kernfs_node
* @kn: the target kernfs_node
@@ -515,26 +574,21 @@ void kernfs_put(struct kernfs_node *kn)
* Moving/renaming is always done while holding reference.
* kn->parent won't change beneath us.
*/
- parent = kn->parent;
+ parent = kernfs_parent(kn);
WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
"kernfs_put: %s/%s: released with incorrect active_ref %d\n",
- parent ? parent->name : "", kn->name, atomic_read(&kn->active));
+ parent ? rcu_dereference(parent->name) : "",
+ rcu_dereference(kn->name), atomic_read(&kn->active));
if (kernfs_type(kn) == KERNFS_LINK)
kernfs_put(kn->symlink.target_kn);
- kfree_const(kn->name);
+ spin_lock(&root->kernfs_idr_lock);
+ idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
+ spin_unlock(&root->kernfs_idr_lock);
- if (kn->iattr) {
- if (kn->iattr->ia_secdata)
- security_release_secctx(kn->iattr->ia_secdata,
- kn->iattr->ia_secdata_len);
- simple_xattrs_free(&kn->iattr->xattrs);
- }
- kfree(kn->iattr);
- ida_simple_remove(&root->ino_ida, kn->ino);
- kmem_cache_free(kernfs_node_cache, kn);
+ call_rcu(&kn->rcu, kernfs_free_rcu);
kn = parent;
if (kn) {
@@ -542,66 +596,17 @@ void kernfs_put(struct kernfs_node *kn)
goto repeat;
} else {
/* just released the root kn, free @root too */
- ida_destroy(&root->ino_ida);
- kfree(root);
+ idr_destroy(&root->ino_idr);
+ kfree_rcu(root, rcu);
}
}
EXPORT_SYMBOL_GPL(kernfs_put);
-static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
-{
- struct kernfs_node *kn;
-
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- /* Always perform fresh lookup for negatives */
- if (d_really_is_negative(dentry))
- goto out_bad_unlocked;
-
- kn = dentry->d_fsdata;
- mutex_lock(&kernfs_mutex);
-
- /* The kernfs node has been deactivated */
- if (!kernfs_active(kn))
- goto out_bad;
-
- /* The kernfs node has been moved? */
- if (dentry->d_parent->d_fsdata != kn->parent)
- goto out_bad;
-
- /* The kernfs node has been renamed */
- if (strcmp(dentry->d_name.name, kn->name) != 0)
- goto out_bad;
-
- /* The kernfs node has been moved to a different namespace */
- if (kn->parent && kernfs_ns_enabled(kn->parent) &&
- kernfs_info(dentry->d_sb)->ns != kn->ns)
- goto out_bad;
-
- mutex_unlock(&kernfs_mutex);
- return 1;
-out_bad:
- mutex_unlock(&kernfs_mutex);
-out_bad_unlocked:
- return 0;
-}
-
-static void kernfs_dop_release(struct dentry *dentry)
-{
- kernfs_put(dentry->d_fsdata);
-}
-
-const struct dentry_operations kernfs_dops = {
- .d_revalidate = kernfs_dop_revalidate,
- .d_release = kernfs_dop_release,
-};
-
/**
* kernfs_node_from_dentry - determine kernfs_node associated with a dentry
* @dentry: the dentry in question
*
- * Return the kernfs_node associated with @dentry. If @dentry is not a
+ * Return: the kernfs_node associated with @dentry. If @dentry is not a
* kernfs one, %NULL is returned.
*
* While the returned kernfs_node will stay accessible as long as @dentry
@@ -611,15 +616,18 @@ const struct dentry_operations kernfs_dops = {
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
if (dentry->d_sb->s_op == &kernfs_sops)
- return dentry->d_fsdata;
+ return kernfs_dentry_node(dentry);
return NULL;
}
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
+ struct kernfs_node *parent,
const char *name, umode_t mode,
+ kuid_t uid, kgid_t gid,
unsigned flags)
{
struct kernfs_node *kn;
+ u32 id_highbits;
int ret;
name = kstrdup_const(name, GFP_KERNEL);
@@ -630,21 +638,55 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
if (!kn)
goto err_out1;
- ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
+ idr_preload(GFP_KERNEL);
+ spin_lock(&root->kernfs_idr_lock);
+ ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
+ if (ret >= 0 && ret < root->last_id_lowbits)
+ root->id_highbits++;
+ id_highbits = root->id_highbits;
+ root->last_id_lowbits = ret;
+ spin_unlock(&root->kernfs_idr_lock);
+ idr_preload_end();
if (ret < 0)
goto err_out2;
- kn->ino = ret;
+
+ kn->id = (u64)id_highbits << 32 | ret;
atomic_set(&kn->count, 1);
atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
RB_CLEAR_NODE(&kn->rb);
- kn->name = name;
+ rcu_assign_pointer(kn->name, name);
kn->mode = mode;
kn->flags = flags;
+ if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) {
+ struct iattr iattr = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = uid,
+ .ia_gid = gid,
+ };
+
+ ret = __kernfs_setattr(kn, &iattr);
+ if (ret < 0)
+ goto err_out3;
+ }
+
+ if (parent) {
+ ret = security_kernfs_init_security(parent, kn);
+ if (ret)
+ goto err_out4;
+ }
+
return kn;
+ err_out4:
+ simple_xattrs_free(&kn->iattr->xattrs, NULL);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
+ err_out3:
+ spin_lock(&root->kernfs_idr_lock);
+ idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
+ spin_unlock(&root->kernfs_idr_lock);
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
@@ -654,18 +696,81 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
const char *name, umode_t mode,
+ kuid_t uid, kgid_t gid,
unsigned flags)
{
struct kernfs_node *kn;
- kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
+ if (parent->mode & S_ISGID) {
+ /* this code block imitates inode_init_owner() for
+ * kernfs
+ */
+
+ if (parent->iattr)
+ gid = parent->iattr->ia_gid;
+
+ if (flags & KERNFS_DIR)
+ mode |= S_ISGID;
+ }
+
+ kn = __kernfs_new_node(kernfs_root(parent), parent,
+ name, mode, uid, gid, flags);
if (kn) {
kernfs_get(parent);
- kn->parent = parent;
+ rcu_assign_pointer(kn->__parent, parent);
}
return kn;
}
+/*
+ * kernfs_find_and_get_node_by_id - get kernfs_node from node id
+ * @root: the kernfs root
+ * @id: the target node id
+ *
+ * @id's lower 32bits encode ino and upper gen. If the gen portion is
+ * zero, all generations are matched.
+ *
+ * Return: %NULL on failure,
+ * otherwise a kernfs node with reference counter incremented.
+ */
+struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
+ u64 id)
+{
+ struct kernfs_node *kn;
+ ino_t ino = kernfs_id_ino(id);
+ u32 gen = kernfs_id_gen(id);
+
+ rcu_read_lock();
+
+ kn = idr_find(&root->ino_idr, (u32)ino);
+ if (!kn)
+ goto err_unlock;
+
+ if (sizeof(ino_t) >= sizeof(u64)) {
+ /* we looked up with the low 32bits, compare the whole */
+ if (kernfs_ino(kn) != ino)
+ goto err_unlock;
+ } else {
+ /* 0 matches all generations */
+ if (unlikely(gen && kernfs_gen(kn) != gen))
+ goto err_unlock;
+ }
+
+ /*
+ * We should fail if @kn has never been activated and guarantee success
+ * if the caller knows that @kn is active. Both can be achieved by
+ * __kernfs_active() which tests @kn->active without kernfs_rwsem.
+ */
+ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
+ goto err_unlock;
+
+ rcu_read_unlock();
+ return kn;
+err_unlock:
+ rcu_read_unlock();
+ return NULL;
+}
+
/**
* kernfs_add_one - add kernfs_node to parent without warning
* @kn: kernfs_node to be added
@@ -674,50 +779,52 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
* function increments nlink of the parent's inode if @kn is a
* directory and link into the children list of the parent.
*
- * RETURNS:
- * 0 on success, -EEXIST if entry with the given name already
+ * Return:
+ * %0 on success, -EEXIST if entry with the given name already
* exists.
*/
int kernfs_add_one(struct kernfs_node *kn)
{
- struct kernfs_node *parent = kn->parent;
+ struct kernfs_root *root = kernfs_root(kn);
struct kernfs_iattrs *ps_iattr;
+ struct kernfs_node *parent;
bool has_ns;
int ret;
- mutex_lock(&kernfs_mutex);
+ down_write(&root->kernfs_rwsem);
+ parent = kernfs_parent(kn);
ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
- has_ns ? "required" : "invalid", parent->name, kn->name))
+ has_ns ? "required" : "invalid",
+ kernfs_rcu_name(parent), kernfs_rcu_name(kn)))
goto out_unlock;
if (kernfs_type(parent) != KERNFS_DIR)
goto out_unlock;
ret = -ENOENT;
- if (parent->flags & KERNFS_EMPTY_DIR)
- goto out_unlock;
-
- if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
+ if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR))
goto out_unlock;
- kn->hash = kernfs_name_hash(kn->name, kn->ns);
+ kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns);
ret = kernfs_link_sibling(kn);
if (ret)
goto out_unlock;
/* Update timestamps on the parent */
+ down_write(&root->kernfs_iattr_rwsem);
+
ps_iattr = parent->iattr;
if (ps_iattr) {
- struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
- ktime_get_real_ts(&ps_iattrs->ia_ctime);
- ps_iattrs->ia_mtime = ps_iattrs->ia_ctime;
+ ktime_get_real_ts64(&ps_iattr->ia_ctime);
+ ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_iattr_rwsem);
+ up_write(&root->kernfs_rwsem);
/*
* Activate the new node unless CREATE_DEACTIVATED is requested.
@@ -731,7 +838,7 @@ int kernfs_add_one(struct kernfs_node *kn)
return 0;
out_unlock:
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_rwsem);
return ret;
}
@@ -741,8 +848,9 @@ out_unlock:
* @name: name to look for
* @ns: the namespace tag to use
*
- * Look for kernfs_node with name @name under @parent. Returns pointer to
- * the found kernfs_node on success, %NULL on failure.
+ * Look for kernfs_node with name @name under @parent.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
const unsigned char *name,
@@ -752,11 +860,11 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
bool has_ns = kernfs_ns_enabled(parent);
unsigned int hash;
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem);
if (has_ns != (bool)ns) {
WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
- has_ns ? "required" : "invalid", parent->name, name);
+ has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name);
return NULL;
}
@@ -781,18 +889,17 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
const unsigned char *path,
const void *ns)
{
- size_t len;
+ ssize_t len;
char *p, *name;
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
- /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
- spin_lock_irq(&kernfs_rename_lock);
+ spin_lock_irq(&kernfs_pr_cont_lock);
- len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
+ len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
- if (len >= sizeof(kernfs_pr_cont_buf)) {
- spin_unlock_irq(&kernfs_rename_lock);
+ if (len < 0) {
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return NULL;
}
@@ -804,7 +911,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
parent = kernfs_find_ns(parent, name, ns);
}
- spin_unlock_irq(&kernfs_rename_lock);
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return parent;
}
@@ -816,18 +923,20 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
* @ns: the namespace tag to use
*
* Look for kernfs_node with name @name under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns)
{
struct kernfs_node *kn;
+ struct kernfs_root *root = kernfs_root(parent);
- mutex_lock(&kernfs_mutex);
+ down_read(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
kernfs_get(kn);
- mutex_unlock(&kernfs_mutex);
+ up_read(&root->kernfs_rwsem);
return kn;
}
@@ -840,29 +949,36 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
* @ns: the namespace tag to use
*
* Look for kernfs_node with path @path under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
const char *path, const void *ns)
{
struct kernfs_node *kn;
+ struct kernfs_root *root = kernfs_root(parent);
- mutex_lock(&kernfs_mutex);
+ down_read(&root->kernfs_rwsem);
kn = kernfs_walk_ns(parent, path, ns);
kernfs_get(kn);
- mutex_unlock(&kernfs_mutex);
+ up_read(&root->kernfs_rwsem);
return kn;
}
+unsigned int kernfs_root_flags(struct kernfs_node *kn)
+{
+ return kernfs_root(kn)->flags;
+}
+
/**
* kernfs_create_root - create a new kernfs hierarchy
* @scops: optional syscall operations for the hierarchy
* @flags: KERNFS_ROOT_* flags
* @priv: opaque data associated with the new directory
*
- * Returns the root of the new hierarchy on success, ERR_PTR() value on
+ * Return: the root of the new hierarchy on success, ERR_PTR() value on
* failure.
*/
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
@@ -875,13 +991,30 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
if (!root)
return ERR_PTR(-ENOMEM);
- ida_init(&root->ino_ida);
+ idr_init(&root->ino_idr);
+ spin_lock_init(&root->kernfs_idr_lock);
+ init_rwsem(&root->kernfs_rwsem);
+ init_rwsem(&root->kernfs_iattr_rwsem);
+ init_rwsem(&root->kernfs_supers_rwsem);
INIT_LIST_HEAD(&root->supers);
+ rwlock_init(&root->kernfs_rename_lock);
- kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
+ /*
+ * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino and
+ * high bits are generation. The starting value for both ino and
+ * generation is 1. Initialize upper 32bit allocation
+ * accordingly.
+ */
+ if (sizeof(ino_t) >= sizeof(u64))
+ root->id_highbits = 0;
+ else
+ root->id_highbits = 1;
+
+ kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
KERNFS_DIR);
if (!kn) {
- ida_destroy(&root->ino_ida);
+ idr_destroy(&root->ino_idr);
kfree(root);
return ERR_PTR(-ENOMEM);
}
@@ -909,7 +1042,24 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
*/
void kernfs_destroy_root(struct kernfs_root *root)
{
- kernfs_remove(root->kn); /* will also free @root */
+ /*
+ * kernfs_remove holds kernfs_rwsem from the root so the root
+ * shouldn't be freed during the operation.
+ */
+ kernfs_get(root->kn);
+ kernfs_remove(root->kn);
+ kernfs_put(root->kn); /* will also free @root */
+}
+
+/**
+ * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root
+ * @root: root to use to lookup
+ *
+ * Return: @root's kernfs_node
+ */
+struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
+{
+ return root->kn;
}
/**
@@ -917,20 +1067,24 @@ void kernfs_destroy_root(struct kernfs_root *root)
* @parent: parent in which to create a new directory
* @name: name of the new directory
* @mode: mode of the new directory
+ * @uid: uid of the new directory
+ * @gid: gid of the new directory
* @priv: opaque data associated with the new directory
* @ns: optional namespace tag of the directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
const char *name, umode_t mode,
+ kuid_t uid, kgid_t gid,
void *priv, const void *ns)
{
struct kernfs_node *kn;
int rc;
/* allocate */
- kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
+ kn = kernfs_new_node(parent, name, mode | S_IFDIR,
+ uid, gid, KERNFS_DIR);
if (!kn)
return ERR_PTR(-ENOMEM);
@@ -952,7 +1106,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
* @parent: parent in which to create a new directory
* @name: name of the new directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
const char *name)
@@ -961,7 +1115,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
int rc;
/* allocate */
- kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR);
+ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR);
if (!kn)
return ERR_PTR(-ENOMEM);
@@ -979,67 +1134,149 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
return ERR_PTR(rc);
}
+static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
+{
+ struct kernfs_node *kn, *parent;
+ struct kernfs_root *root;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ /* Negative hashed dentry? */
+ if (d_really_is_negative(dentry)) {
+ /* If the kernfs parent node has changed discard and
+ * proceed to ->lookup.
+ *
+ * There's nothing special needed here when getting the
+ * dentry parent, even if a concurrent rename is in
+ * progress. That's because the dentry is negative so
+ * it can only be the target of the rename and it will
+ * be doing a d_move() not a replace. Consequently the
+ * dentry d_parent won't change over the d_move().
+ *
+ * Also kernfs negative dentries transitioning from
+ * negative to positive during revalidate won't happen
+ * because they are invalidated on containing directory
+ * changes and the lookup re-done so that a new positive
+ * dentry can be properly created.
+ */
+ root = kernfs_root_from_sb(dentry->d_sb);
+ down_read(&root->kernfs_rwsem);
+ parent = kernfs_dentry_node(dentry->d_parent);
+ if (parent) {
+ if (kernfs_dir_changed(parent, dentry)) {
+ up_read(&root->kernfs_rwsem);
+ return 0;
+ }
+ }
+ up_read(&root->kernfs_rwsem);
+
+ /* The kernfs parent node hasn't changed, leave the
+ * dentry negative and return success.
+ */
+ return 1;
+ }
+
+ kn = kernfs_dentry_node(dentry);
+ root = kernfs_root(kn);
+ down_read(&root->kernfs_rwsem);
+
+ /* The kernfs node has been deactivated */
+ if (!kernfs_active(kn))
+ goto out_bad;
+
+ parent = kernfs_parent(kn);
+ /* The kernfs node has been moved? */
+ if (kernfs_dentry_node(dentry->d_parent) != parent)
+ goto out_bad;
+
+ /* The kernfs node has been renamed */
+ if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0)
+ goto out_bad;
+
+ /* The kernfs node has been moved to a different namespace */
+ if (parent && kernfs_ns_enabled(parent) &&
+ kernfs_info(dentry->d_sb)->ns != kn->ns)
+ goto out_bad;
+
+ up_read(&root->kernfs_rwsem);
+ return 1;
+out_bad:
+ up_read(&root->kernfs_rwsem);
+ return 0;
+}
+
+const struct dentry_operations kernfs_dops = {
+ .d_revalidate = kernfs_dop_revalidate,
+};
+
static struct dentry *kernfs_iop_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
- struct dentry *ret;
- struct kernfs_node *parent = dentry->d_parent->d_fsdata;
+ struct kernfs_node *parent = dir->i_private;
struct kernfs_node *kn;
- struct inode *inode;
+ struct kernfs_root *root;
+ struct inode *inode = NULL;
const void *ns = NULL;
- mutex_lock(&kernfs_mutex);
-
+ root = kernfs_root(parent);
+ down_read(&root->kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dir->i_sb)->ns;
kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
-
- /* no such entry */
- if (!kn || !kernfs_active(kn)) {
- ret = NULL;
- goto out_unlock;
- }
- kernfs_get(kn);
- dentry->d_fsdata = kn;
-
/* attach dentry and inode */
- inode = kernfs_get_inode(dir->i_sb, kn);
- if (!inode) {
- ret = ERR_PTR(-ENOMEM);
- goto out_unlock;
+ if (kn) {
+ /* Inactive nodes are invisible to the VFS so don't
+ * create a negative.
+ */
+ if (!kernfs_active(kn)) {
+ up_read(&root->kernfs_rwsem);
+ return NULL;
+ }
+ inode = kernfs_get_inode(dir->i_sb, kn);
+ if (!inode)
+ inode = ERR_PTR(-ENOMEM);
}
+ /*
+ * Needed for negative dentry validation.
+ * The negative dentry can be created in kernfs_iop_lookup()
+ * or transformed from a positive dentry in dentry_unlink_inode()
+ * called from vfs_rmdir().
+ */
+ if (!IS_ERR(inode))
+ kernfs_set_rev(parent, dentry);
+ up_read(&root->kernfs_rwsem);
- /* instantiate and hash dentry */
- ret = d_splice_alias(inode, dentry);
- out_unlock:
- mutex_unlock(&kernfs_mutex);
- return ret;
+ /* instantiate and hash (possibly negative) dentry */
+ return d_splice_alias(inode, dentry);
}
-static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
struct kernfs_node *parent = dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
int ret;
if (!scops || !scops->mkdir)
- return -EPERM;
+ return ERR_PTR(-EPERM);
if (!kernfs_get_active(parent))
- return -ENODEV;
+ return ERR_PTR(-ENODEV);
ret = scops->mkdir(parent, dentry->d_name.name, mode);
kernfs_put_active(parent);
- return ret;
+ return ERR_PTR(ret);
}
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_node *kn = kernfs_dentry_node(dentry);
struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
int ret;
@@ -1055,11 +1292,12 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
return ret;
}
-static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int kernfs_iop_rename(struct mnt_idmap *idmap,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- struct kernfs_node *kn = old_dentry->d_fsdata;
+ struct kernfs_node *kn = kernfs_dentry_node(old_dentry);
struct kernfs_node *new_parent = new_dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
int ret;
@@ -1127,13 +1365,15 @@ static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
* Find the next descendant to visit for post-order traversal of @root's
* descendants. @root is included in the iteration and the last node to be
* visited.
+ *
+ * Return: the next descendant to visit or %NULL when done.
*/
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
struct kernfs_node *root)
{
struct rb_node *rbn;
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem);
/* if first iteration, visit leftmost descendant which may be root */
if (!pos)
@@ -1149,7 +1389,22 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
return kernfs_leftmost_descendant(rb_to_kn(rbn));
/* no sibling left, visit parent */
- return pos->parent;
+ return kernfs_parent(pos);
+}
+
+static void kernfs_activate_one(struct kernfs_node *kn)
+{
+ lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
+
+ kn->flags |= KERNFS_ACTIVATED;
+
+ if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING)))
+ return;
+
+ WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb));
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);
}
/**
@@ -1168,84 +1423,110 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
void kernfs_activate(struct kernfs_node *kn)
{
struct kernfs_node *pos;
+ struct kernfs_root *root = kernfs_root(kn);
- mutex_lock(&kernfs_mutex);
+ down_write(&root->kernfs_rwsem);
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn))) {
- if (!pos || (pos->flags & KERNFS_ACTIVATED))
- continue;
+ while ((pos = kernfs_next_descendant_post(pos, kn)))
+ kernfs_activate_one(pos);
+
+ up_write(&root->kernfs_rwsem);
+}
+
+/**
+ * kernfs_show - show or hide a node
+ * @kn: kernfs_node to show or hide
+ * @show: whether to show or hide
+ *
+ * If @show is %false, @kn is marked hidden and deactivated. A hidden node is
+ * ignored in future activations. If %true, the mark is removed and activation
+ * state is restored. This function won't implicitly activate a new node in a
+ * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet.
+ *
+ * To avoid recursion complexities, directories aren't supported for now.
+ */
+void kernfs_show(struct kernfs_node *kn, bool show)
+{
+ struct kernfs_root *root = kernfs_root(kn);
- WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
- WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
+ if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR))
+ return;
- atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
- pos->flags |= KERNFS_ACTIVATED;
+ down_write(&root->kernfs_rwsem);
+
+ if (show) {
+ kn->flags &= ~KERNFS_HIDDEN;
+ if (kn->flags & KERNFS_ACTIVATED)
+ kernfs_activate_one(kn);
+ } else {
+ kn->flags |= KERNFS_HIDDEN;
+ if (kernfs_active(kn))
+ atomic_add(KN_DEACTIVATED_BIAS, &kn->active);
+ kernfs_drain(kn);
}
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_rwsem);
}
static void __kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_node *pos;
+ struct kernfs_node *pos, *parent;
- lockdep_assert_held(&kernfs_mutex);
+ /* Short-circuit if non-root @kn has already finished removal. */
+ if (!kn)
+ return;
+
+ lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
/*
- * Short-circuit if non-root @kn has already finished removal.
* This is for kernfs_remove_self() which plays with active ref
* after removal.
*/
- if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
+ if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb))
return;
- pr_debug("kernfs %s: removing\n", kn->name);
+ pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn));
- /* prevent any new usage under @kn by deactivating all nodes */
+ /* prevent new usage by marking all nodes removing and deactivating */
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn)))
+ while ((pos = kernfs_next_descendant_post(pos, kn))) {
+ pos->flags |= KERNFS_REMOVING;
if (kernfs_active(pos))
atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
+ }
/* deactivate and unlink the subtree node-by-node */
do {
pos = kernfs_leftmost_descendant(kn);
/*
- * kernfs_drain() drops kernfs_mutex temporarily and @pos's
+ * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's
* base ref could have been put by someone else by the time
* the function returns. Make sure it doesn't go away
* underneath us.
*/
kernfs_get(pos);
- /*
- * Drain iff @kn was activated. This avoids draining and
- * its lockdep annotations for nodes which have never been
- * activated and allows embedding kernfs_remove() in create
- * error paths without worrying about draining.
- */
- if (kn->flags & KERNFS_ACTIVATED)
- kernfs_drain(pos);
- else
- WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
-
+ kernfs_drain(pos);
+ parent = kernfs_parent(pos);
/*
* kernfs_unlink_sibling() succeeds once per node. Use it
* to decide who's responsible for cleanups.
*/
- if (!pos->parent || kernfs_unlink_sibling(pos)) {
+ if (!parent || kernfs_unlink_sibling(pos)) {
struct kernfs_iattrs *ps_iattr =
- pos->parent ? pos->parent->iattr : NULL;
+ parent ? parent->iattr : NULL;
/* update timestamps on the parent */
+ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+
if (ps_iattr) {
- ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime);
- ps_iattr->ia_iattr.ia_mtime =
- ps_iattr->ia_iattr.ia_ctime;
+ ktime_get_real_ts64(&ps_iattr->ia_ctime);
+ ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
kernfs_put(pos);
}
@@ -1261,9 +1542,16 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- mutex_lock(&kernfs_mutex);
+ struct kernfs_root *root;
+
+ if (!kn)
+ return;
+
+ root = kernfs_root(kn);
+
+ down_write(&root->kernfs_rwsem);
__kernfs_remove(kn);
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_rwsem);
}
/**
@@ -1297,8 +1585,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn)
* invoked before finishing the kernfs operation. Note that while this
* function restores the active reference, it doesn't and can't actually
* restore the active protection - @kn may already or be in the process of
- * being removed. Once kernfs_break_active_protection() is invoked, that
- * protection is irreversibly gone for the kernfs operation instance.
+ * being drained and removed. Once kernfs_break_active_protection() is
+ * invoked, that protection is irreversibly gone for the kernfs operation
+ * instance.
*
* While this function may be called at any point after
* kernfs_break_active_protection() is invoked, its most useful location
@@ -1345,22 +1634,25 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
* the whole kernfs_ops which won the arbitration. This can be used to
* guarantee, for example, all concurrent writes to a "delete" file to
* finish only after the whole operation is complete.
+ *
+ * Return: %true if @kn is removed by this call, otherwise %false.
*/
bool kernfs_remove_self(struct kernfs_node *kn)
{
bool ret;
+ struct kernfs_root *root = kernfs_root(kn);
- mutex_lock(&kernfs_mutex);
+ down_write(&root->kernfs_rwsem);
kernfs_break_active_protection(kn);
/*
* SUICIDAL is used to arbitrate among competing invocations. Only
* the first one will actually perform removal. When the removal
* is complete, SUICIDED is set and the active ref is restored
- * while holding kernfs_mutex. The ones which lost arbitration
- * waits for SUICDED && drained which can happen only after the
- * enclosing kernfs operation which executed the winning instance
- * of kernfs_remove_self() finished.
+ * while kernfs_rwsem is held exclusive. The ones which lost
+ * arbitration wait for SUICIDED && drained which can happen only
+ * after the enclosing kernfs operation which executed the winning
+ * instance of kernfs_remove_self() finished.
*/
if (!(kn->flags & KERNFS_SUICIDAL)) {
kn->flags |= KERNFS_SUICIDAL;
@@ -1378,9 +1670,9 @@ bool kernfs_remove_self(struct kernfs_node *kn)
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
break;
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_rwsem);
schedule();
- mutex_lock(&kernfs_mutex);
+ down_write(&root->kernfs_rwsem);
}
finish_wait(waitq, &wait);
WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
@@ -1388,12 +1680,12 @@ bool kernfs_remove_self(struct kernfs_node *kn)
}
/*
- * This must be done while holding kernfs_mutex; otherwise, waiting
- * for SUICIDED && deactivated could finish prematurely.
+ * This must be done while kernfs_rwsem is held exclusive; otherwise,
+ * waiting for SUICIDED && deactivated could finish prematurely.
*/
kernfs_unbreak_active_protection(kn);
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_rwsem);
return ret;
}
@@ -1404,12 +1696,14 @@ bool kernfs_remove_self(struct kernfs_node *kn)
* @ns: namespace tag of the kernfs_node to remove
*
* Look for the kernfs_node with @name and @ns under @parent and remove it.
- * Returns 0 on success, -ENOENT if such entry doesn't exist.
+ *
+ * Return: %0 on success, -ENOENT if such entry doesn't exist.
*/
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
const void *ns)
{
struct kernfs_node *kn;
+ struct kernfs_root *root;
if (!parent) {
WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
@@ -1417,13 +1711,17 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
return -ENOENT;
}
- mutex_lock(&kernfs_mutex);
+ root = kernfs_root(parent);
+ down_write(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
- if (kn)
+ if (kn) {
+ kernfs_get(kn);
__kernfs_remove(kn);
+ kernfs_put(kn);
+ }
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_rwsem);
if (kn)
return 0;
@@ -1437,28 +1735,42 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
* @new_parent: new parent to put @sd under
* @new_name: new name
* @new_ns: new namespace tag
+ *
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns)
{
struct kernfs_node *old_parent;
- const char *old_name = NULL;
+ struct kernfs_root *root;
+ const char *old_name;
int error;
/* can't move or rename root */
- if (!kn->parent)
+ if (!rcu_access_pointer(kn->__parent))
return -EINVAL;
- mutex_lock(&kernfs_mutex);
+ root = kernfs_root(kn);
+ down_write(&root->kernfs_rwsem);
error = -ENOENT;
if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
(new_parent->flags & KERNFS_EMPTY_DIR))
goto out;
+ old_parent = kernfs_parent(kn);
+ if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) {
+ error = -EINVAL;
+ if (WARN_ON_ONCE(old_parent != new_parent))
+ goto out;
+ }
+
error = 0;
- if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
- (strcmp(kn->name, new_name) == 0))
+ old_name = kernfs_rcu_name(kn);
+ if (!new_name)
+ new_name = old_name;
+ if ((old_parent == new_parent) && (kn->ns == new_ns) &&
+ (strcmp(old_name, new_name) == 0))
goto out; /* nothing to rename */
error = -EEXIST;
@@ -1466,7 +1778,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
goto out;
/* rename kernfs_node */
- if (strcmp(kn->name, new_name) != 0) {
+ if (strcmp(old_name, new_name) != 0) {
error = -ENOMEM;
new_name = kstrdup_const(new_name, GFP_KERNEL);
if (!new_name)
@@ -1479,40 +1791,39 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
* Move to the appropriate place in the appropriate directories rbtree.
*/
kernfs_unlink_sibling(kn);
- kernfs_get(new_parent);
- /* rename_lock protects ->parent and ->name accessors */
- spin_lock_irq(&kernfs_rename_lock);
+ /* rename_lock protects ->parent accessors */
+ if (old_parent != new_parent) {
+ kernfs_get(new_parent);
+ write_lock_irq(&root->kernfs_rename_lock);
- old_parent = kn->parent;
- kn->parent = new_parent;
+ rcu_assign_pointer(kn->__parent, new_parent);
- kn->ns = new_ns;
- if (new_name) {
- old_name = kn->name;
- kn->name = new_name;
- }
+ kn->ns = new_ns;
+ if (new_name)
+ rcu_assign_pointer(kn->name, new_name);
- spin_unlock_irq(&kernfs_rename_lock);
+ write_unlock_irq(&root->kernfs_rename_lock);
+ kernfs_put(old_parent);
+ } else {
+ /* name assignment is RCU protected, parent is the same */
+ kn->ns = new_ns;
+ if (new_name)
+ rcu_assign_pointer(kn->name, new_name);
+ }
- kn->hash = kernfs_name_hash(kn->name, kn->ns);
+ kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns);
kernfs_link_sibling(kn);
- kernfs_put(old_parent);
- kfree_const(old_name);
+ if (new_name && !is_kernel_rodata((unsigned long)old_name))
+ kfree_rcu_mightsleep(old_name);
error = 0;
out:
- mutex_unlock(&kernfs_mutex);
+ up_write(&root->kernfs_rwsem);
return error;
}
-/* Relationship between s_mode and the DT_xxx types */
-static inline unsigned char dt_type(struct kernfs_node *kn)
-{
- return (kn->mode >> 12) & 15;
-}
-
static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
{
kernfs_put(filp->private_data);
@@ -1524,7 +1835,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
{
if (pos) {
int valid = kernfs_active(pos) &&
- pos->parent == parent && hash == pos->hash;
+ rcu_access_pointer(pos->__parent) == parent &&
+ hash == pos->hash;
kernfs_put(pos);
if (!valid)
pos = NULL;
@@ -1572,13 +1884,16 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
- struct kernfs_node *parent = dentry->d_fsdata;
+ struct kernfs_node *parent = kernfs_dentry_node(dentry);
struct kernfs_node *pos = file->private_data;
+ struct kernfs_root *root;
const void *ns = NULL;
if (!dir_emit_dots(file, ctx))
return 0;
- mutex_lock(&kernfs_mutex);
+
+ root = kernfs_root(parent);
+ down_read(&root->kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;
@@ -1586,21 +1901,21 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
pos;
pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
- const char *name = pos->name;
- unsigned int type = dt_type(pos);
+ const char *name = kernfs_rcu_name(pos);
+ unsigned int type = fs_umode_to_dtype(pos->mode);
int len = strlen(name);
- ino_t ino = pos->ino;
+ ino_t ino = kernfs_ino(pos);
ctx->pos = pos->hash;
file->private_data = pos;
kernfs_get(pos);
- mutex_unlock(&kernfs_mutex);
- if (!dir_emit(ctx, name, len, ino, type))
+ if (!dir_emit(ctx, name, len, ino, type)) {
+ up_read(&root->kernfs_rwsem);
return 0;
- mutex_lock(&kernfs_mutex);
+ }
}
- mutex_unlock(&kernfs_mutex);
+ up_read(&root->kernfs_rwsem);
file->private_data = NULL;
ctx->pos = INT_MAX;
return 0;