diff options
Diffstat (limited to 'fs/kernfs/dir.c')
| -rw-r--r-- | fs/kernfs/dir.c | 979 |
1 files changed, 647 insertions, 332 deletions
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index db5900aaa55a..5c0efd6b239f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> - * - * This file is released under the GPLv2. */ #include <linux/sched.h> @@ -18,18 +17,29 @@ #include "kernfs-internal.h" -DEFINE_MUTEX(kernfs_mutex); -static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ -static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +/* + * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to + * call pr_cont() while holding rename_lock. Because sometimes pr_cont() + * will perform wakeups when releasing console_sem. Holding rename_lock + * will introduce deadlock if the scheduler reads the kernfs_name in the + * wakeup path. + */ +static DEFINE_SPINLOCK(kernfs_pr_cont_lock); +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) -static bool kernfs_active(struct kernfs_node *kn) +static bool __kernfs_active(struct kernfs_node *kn) { - lockdep_assert_held(&kernfs_mutex); return atomic_read(&kn->active) >= 0; } +static bool kernfs_active(struct kernfs_node *kn) +{ + lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); + return __kernfs_active(kn); +} + static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -39,22 +49,14 @@ static bool kernfs_lockdep(struct kernfs_node *kn) #endif } -static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) -{ - if (!kn) - return strlcpy(buf, "(null)", buflen); - - return strlcpy(buf, kn->parent ? kn->name : "/", buflen); -} - /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; - while (to->parent && to != from) { + while (rcu_dereference(to->__parent) && to != from) { depth++; - to = to->parent; + to = rcu_dereference(to->__parent); } return depth; } @@ -72,18 +74,18 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, db = kernfs_depth(rb->kn, b); while (da > db) { - a = a->parent; + a = rcu_dereference(a->__parent); da--; } while (db > da) { - b = b->parent; + b = rcu_dereference(b->__parent); db--; } /* worst case b and a will be the same at root */ while (b != a) { - b = b->parent; - a = a->parent; + b = rcu_dereference(b->__parent); + a = rcu_dereference(a->__parent); } return a; @@ -113,9 +115,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * - * [3] when @kn_to is NULL result will be "(null)" + * [3] when @kn_to is %NULL result will be "(null)" * - * Returns the length of the full path. If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -126,16 +128,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; + ssize_t copied; int i, j; if (!kn_to) - return strlcpy(buf, "(null)", buflen); + return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) - return strlcpy(buf, "/", buflen); + return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) @@ -144,21 +147,24 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); - if (buf) - buf[0] = '\0'; + buf[0] = '\0'; - for (i = 0; i < depth_from; i++) - len += strlcpy(buf + len, parent_str, - len < buflen ? buflen - len : 0); + for (i = 0; i < depth_from; i++) { + copied = strscpy(buf + len, parent_str, buflen - len); + if (copied < 0) + return copied; + len += copied; + } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { + const char *name; + for (kn = kn_to, j = 0; j < i; j++) - kn = kn->parent; - len += strlcpy(buf + len, "/", - len < buflen ? buflen - len : 0); - len += strlcpy(buf + len, kn->name, - len < buflen ? buflen - len : 0); + kn = rcu_dereference(kn->__parent); + + name = rcu_dereference(kn->name); + len += scnprintf(buf + len, buflen - len, "/%s", name); } return len; @@ -171,22 +177,29 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is - * similar to strlcpy(). It returns the length of @kn's name and if @buf - * isn't long enough, it's filled upto @buflen-1 and nul terminated. + * similar to strscpy(). + * + * Fills buffer with "(null)" if @kn is %NULL. * - * Fills buffer with "(null)" if @kn is NULL. + * Return: the resulting length of @buf. If @buf isn't long enough, + * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { - unsigned long flags; - int ret; + struct kernfs_node *kn_parent; - spin_lock_irqsave(&kernfs_rename_lock, flags); - ret = kernfs_name_locked(kn, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); - return ret; + if (!kn) + return strscpy(buf, "(null)", buflen); + + guard(rcu)(); + /* + * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and + * the parent is either existing or not. + */ + kn_parent = rcu_dereference(kn->__parent); + return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen); } /** @@ -201,20 +214,24 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * Returns the length of the full path. If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { - unsigned long flags; - int ret; + struct kernfs_root *root; - spin_lock_irqsave(&kernfs_rename_lock, flags); - ret = kernfs_path_from_node_locked(to, from, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); - return ret; + guard(rcu)(); + if (to) { + root = kernfs_root(to); + if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { + guard(read_lock_irqsave)(&root->kernfs_rename_lock); + return kernfs_path_from_node_locked(to, from, buf, buflen); + } + } + return kernfs_path_from_node_locked(to, from, buf, buflen); } EXPORT_SYMBOL_GPL(kernfs_path_from_node); @@ -228,12 +245,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; - spin_lock_irqsave(&kernfs_rename_lock, flags); + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); - kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** @@ -247,24 +264,22 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) unsigned long flags; int sz; - spin_lock_irqsave(&kernfs_rename_lock, flags); + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); - sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf, - sizeof(kernfs_pr_cont_buf)); + sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); if (sz < 0) { - pr_cont("(error)"); - goto out; - } - - if (sz >= sizeof(kernfs_pr_cont_buf)) { - pr_cont("(name too long)"); + if (sz == -E2BIG) + pr_cont("(name too long)"); + else + pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** @@ -273,26 +288,30 @@ out: * * Determines @kn's parent, pins and returns it. This function can be * called from any context. + * + * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; + struct kernfs_root *root; unsigned long flags; - spin_lock_irqsave(&kernfs_rename_lock, flags); - parent = kn->parent; + root = kernfs_root(kn); + read_lock_irqsave(&root->kernfs_rename_lock, flags); + parent = kernfs_parent(kn); kernfs_get(parent); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&root->kernfs_rename_lock, flags); return parent; } /** - * kernfs_name_hash + * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * - * Returns 31 bit hash of ns + name (so it fits in an off_t ) + * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { @@ -321,13 +340,13 @@ static int kernfs_name_compare(unsigned int hash, const char *name, return -1; if (ns > kn->ns) return 1; - return strcmp(name, kn->name); + return strcmp(name, kernfs_rcu_name(kn)); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { - return kernfs_name_compare(left->hash, left->name, left->ns, right); + return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); } /** @@ -338,15 +357,19 @@ static int kernfs_sd_compare(const struct kernfs_node *left, * @kn->parent->dir.children. * * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive * - * RETURNS: - * 0 on susccess -EEXIST on failure. + * Return: + * %0 on success, -EEXIST on failure. */ static int kernfs_link_sibling(struct kernfs_node *kn) { - struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; + struct kernfs_node *kn_parent; + struct rb_node **node; + + kn_parent = kernfs_parent(kn); + node = &kn_parent->dir.children.rb_node; while (*node) { struct kernfs_node *pos; @@ -365,11 +388,14 @@ static int kernfs_link_sibling(struct kernfs_node *kn) /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); - rb_insert_color(&kn->rb, &kn->parent->dir.children); + rb_insert_color(&kn->rb, &kn_parent->dir.children); /* successfully added, account subdir number */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs++; + kn_parent->dir.subdirs++; + kernfs_inc_rev(kn_parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } @@ -379,21 +405,29 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from - * kn->parent->dir.children. Returns %true if @kn was actually - * removed, %false if @kn wasn't on the rbtree. + * kn->parent->dir.children. + * + * Return: %true if @kn was actually removed, + * %false if @kn wasn't on the rbtree. * * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { + struct kernfs_node *kn_parent; + if (RB_EMPTY_NODE(&kn->rb)) return false; + kn_parent = kernfs_parent(kn); + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs--; + kn_parent->dir.subdirs--; + kernfs_inc_rev(kn_parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); - rb_erase(&kn->rb, &kn->parent->dir.children); + rb_erase(&kn->rb, &kn_parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } @@ -403,10 +437,10 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn) * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn - * is NULL. + * is %NULL. * - * RETURNS: - * Pointer to @kn on success, NULL on failure. + * Return: + * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { @@ -426,42 +460,52 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn - * is NULL. + * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) - rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; - wake_up_all(&root->deactivate_waitq); + wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * - * Drain existing usages and nuke all existing mmaps of @kn. Mutiple + * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_mutex) __acquires(&kernfs_mutex) + __releases(&kernfs_root(kn)->kernfs_rwsem) + __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); - mutex_unlock(&kernfs_mutex); + /* + * Skip draining if already fully drained. This avoids draining and its + * lockdep annotations for nodes which have never been activated + * allowing embedding kernfs_remove() in create error paths without + * worrying about draining. + */ + if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && + !kernfs_should_drain_open_files(kn)) + return; + + up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -469,18 +513,18 @@ static void kernfs_drain(struct kernfs_node *kn) lock_contended(&kn->dep_map, _RET_IP_); } - /* but everyone should wait for draining */ wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); } - kernfs_drain_open_files(kn); + if (kernfs_should_drain_open_files(kn)) + kernfs_drain_open_files(kn); - mutex_lock(&kernfs_mutex); + down_write(&root->kernfs_rwsem); } /** @@ -496,6 +540,21 @@ void kernfs_get(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_get); +static void kernfs_free_rcu(struct rcu_head *rcu) +{ + struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); + + /* If the whole node goes away, then name can't be used outside */ + kfree_const(rcu_access_pointer(kn->name)); + + if (kn->iattr) { + simple_xattrs_free(&kn->iattr->xattrs, NULL); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); + } + + kmem_cache_free(kernfs_node_cache, kn); +} + /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node @@ -515,26 +574,21 @@ void kernfs_put(struct kernfs_node *kn) * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ - parent = kn->parent; + parent = kernfs_parent(kn); WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", - parent ? parent->name : "", kn->name, atomic_read(&kn->active)); + parent ? rcu_dereference(parent->name) : "", + rcu_dereference(kn->name), atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - kfree_const(kn->name); + spin_lock(&root->kernfs_idr_lock); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); + spin_unlock(&root->kernfs_idr_lock); - if (kn->iattr) { - if (kn->iattr->ia_secdata) - security_release_secctx(kn->iattr->ia_secdata, - kn->iattr->ia_secdata_len); - simple_xattrs_free(&kn->iattr->xattrs); - } - kfree(kn->iattr); - ida_simple_remove(&root->ino_ida, kn->ino); - kmem_cache_free(kernfs_node_cache, kn); + call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { @@ -542,66 +596,17 @@ void kernfs_put(struct kernfs_node *kn) goto repeat; } else { /* just released the root kn, free @root too */ - ida_destroy(&root->ino_ida); - kfree(root); + idr_destroy(&root->ino_idr); + kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); -static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct kernfs_node *kn; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - /* Always perform fresh lookup for negatives */ - if (d_really_is_negative(dentry)) - goto out_bad_unlocked; - - kn = dentry->d_fsdata; - mutex_lock(&kernfs_mutex); - - /* The kernfs node has been deactivated */ - if (!kernfs_active(kn)) - goto out_bad; - - /* The kernfs node has been moved? */ - if (dentry->d_parent->d_fsdata != kn->parent) - goto out_bad; - - /* The kernfs node has been renamed */ - if (strcmp(dentry->d_name.name, kn->name) != 0) - goto out_bad; - - /* The kernfs node has been moved to a different namespace */ - if (kn->parent && kernfs_ns_enabled(kn->parent) && - kernfs_info(dentry->d_sb)->ns != kn->ns) - goto out_bad; - - mutex_unlock(&kernfs_mutex); - return 1; -out_bad: - mutex_unlock(&kernfs_mutex); -out_bad_unlocked: - return 0; -} - -static void kernfs_dop_release(struct dentry *dentry) -{ - kernfs_put(dentry->d_fsdata); -} - -const struct dentry_operations kernfs_dops = { - .d_revalidate = kernfs_dop_revalidate, - .d_release = kernfs_dop_release, -}; - /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * - * Return the kernfs_node associated with @dentry. If @dentry is not a + * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry @@ -611,15 +616,18 @@ const struct dentry_operations kernfs_dops = { struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) - return dentry->d_fsdata; + return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; + u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -630,21 +638,55 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + spin_lock(&root->kernfs_idr_lock); + ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < root->last_id_lowbits) + root->id_highbits++; + id_highbits = root->id_highbits; + root->last_id_lowbits = ret; + spin_unlock(&root->kernfs_idr_lock); + idr_preload_end(); if (ret < 0) goto err_out2; - kn->ino = ret; + + kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); - kn->name = name; + rcu_assign_pointer(kn->name, name); kn->mode = mode; kn->flags = flags; + if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { + struct iattr iattr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = uid, + .ia_gid = gid, + }; + + ret = __kernfs_setattr(kn, &iattr); + if (ret < 0) + goto err_out3; + } + + if (parent) { + ret = security_kernfs_init_security(parent, kn); + if (ret) + goto err_out4; + } + return kn; + err_out4: + simple_xattrs_free(&kn->iattr->xattrs, NULL); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); + err_out3: + spin_lock(&root->kernfs_idr_lock); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); + spin_unlock(&root->kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -654,18 +696,81 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; - kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + if (parent->mode & S_ISGID) { + /* this code block imitates inode_init_owner() for + * kernfs + */ + + if (parent->iattr) + gid = parent->iattr->ia_gid; + + if (flags & KERNFS_DIR) + mode |= S_ISGID; + } + + kn = __kernfs_new_node(kernfs_root(parent), parent, + name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); - kn->parent = parent; + rcu_assign_pointer(kn->__parent, parent); } return kn; } +/* + * kernfs_find_and_get_node_by_id - get kernfs_node from node id + * @root: the kernfs root + * @id: the target node id + * + * @id's lower 32bits encode ino and upper gen. If the gen portion is + * zero, all generations are matched. + * + * Return: %NULL on failure, + * otherwise a kernfs node with reference counter incremented. + */ +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id) +{ + struct kernfs_node *kn; + ino_t ino = kernfs_id_ino(id); + u32 gen = kernfs_id_gen(id); + + rcu_read_lock(); + + kn = idr_find(&root->ino_idr, (u32)ino); + if (!kn) + goto err_unlock; + + if (sizeof(ino_t) >= sizeof(u64)) { + /* we looked up with the low 32bits, compare the whole */ + if (kernfs_ino(kn) != ino) + goto err_unlock; + } else { + /* 0 matches all generations */ + if (unlikely(gen && kernfs_gen(kn) != gen)) + goto err_unlock; + } + + /* + * We should fail if @kn has never been activated and guarantee success + * if the caller knows that @kn is active. Both can be achieved by + * __kernfs_active() which tests @kn->active without kernfs_rwsem. + */ + if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) + goto err_unlock; + + rcu_read_unlock(); + return kn; +err_unlock: + rcu_read_unlock(); + return NULL; +} + /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added @@ -674,50 +779,52 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already + * Return: + * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { - struct kernfs_node *parent = kn->parent; + struct kernfs_root *root = kernfs_root(kn); struct kernfs_iattrs *ps_iattr; + struct kernfs_node *parent; bool has_ns; int ret; - mutex_lock(&kernfs_mutex); + down_write(&root->kernfs_rwsem); + parent = kernfs_parent(kn); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, kn->name)) + has_ns ? "required" : "invalid", + kernfs_rcu_name(parent), kernfs_rcu_name(kn))) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; - if (parent->flags & KERNFS_EMPTY_DIR) - goto out_unlock; - - if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; - kn->hash = kernfs_name_hash(kn->name, kn->ns); + kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ + down_write(&root->kernfs_iattr_rwsem); + ps_iattr = parent->iattr; if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ktime_get_real_ts(&ps_iattrs->ia_ctime); - ps_iattrs->ia_mtime = ps_iattrs->ia_ctime; + ktime_get_real_ts64(&ps_iattr->ia_ctime); + ps_iattr->ia_mtime = ps_iattr->ia_ctime; } - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_iattr_rwsem); + up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. @@ -731,7 +838,7 @@ int kernfs_add_one(struct kernfs_node *kn) return 0; out_unlock: - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_rwsem); return ret; } @@ -741,8 +848,9 @@ out_unlock: * @name: name to look for * @ns: the namespace tag to use * - * Look for kernfs_node with name @name under @parent. Returns pointer to - * the found kernfs_node on success, %NULL on failure. + * Look for kernfs_node with name @name under @parent. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, @@ -752,11 +860,11 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, name); + has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name); return NULL; } @@ -781,18 +889,17 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { - size_t len; + ssize_t len; char *p, *name; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); - /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ - spin_lock_irq(&kernfs_rename_lock); + spin_lock_irq(&kernfs_pr_cont_lock); - len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); + len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); - if (len >= sizeof(kernfs_pr_cont_buf)) { - spin_unlock_irq(&kernfs_rename_lock); + if (len < 0) { + spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } @@ -804,7 +911,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, parent = kernfs_find_ns(parent, name, ns); } - spin_unlock_irq(&kernfs_rename_lock); + spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } @@ -816,18 +923,20 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference - * if found. This function may sleep and returns pointer to the found - * kernfs_node on success, %NULL on failure. + * if found. This function may sleep. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); - mutex_lock(&kernfs_mutex); + down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&root->kernfs_rwsem); return kn; } @@ -840,29 +949,36 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference - * if found. This function may sleep and returns pointer to the found - * kernfs_node on success, %NULL on failure. + * if found. This function may sleep. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); - mutex_lock(&kernfs_mutex); + down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&root->kernfs_rwsem); return kn; } +unsigned int kernfs_root_flags(struct kernfs_node *kn) +{ + return kernfs_root(kn)->flags; +} + /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * - * Returns the root of the new hierarchy on success, ERR_PTR() value on + * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, @@ -875,13 +991,30 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, if (!root) return ERR_PTR(-ENOMEM); - ida_init(&root->ino_ida); + idr_init(&root->ino_idr); + spin_lock_init(&root->kernfs_idr_lock); + init_rwsem(&root->kernfs_rwsem); + init_rwsem(&root->kernfs_iattr_rwsem); + init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); + rwlock_init(&root->kernfs_rename_lock); - kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + /* + * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. + * High bits generation. The starting value for both ino and + * genenration is 1. Initialize upper 32bit allocation + * accordingly. + */ + if (sizeof(ino_t) >= sizeof(u64)) + root->id_highbits = 0; + else + root->id_highbits = 1; + + kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } @@ -909,7 +1042,24 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, */ void kernfs_destroy_root(struct kernfs_root *root) { - kernfs_remove(root->kn); /* will also free @root */ + /* + * kernfs_remove holds kernfs_rwsem from the root so the root + * shouldn't be freed during the operation. + */ + kernfs_get(root->kn); + kernfs_remove(root->kn); + kernfs_put(root->kn); /* will also free @root */ +} + +/** + * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root + * @root: root to use to lookup + * + * Return: @root's kernfs_node + */ +struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) +{ + return root->kn; } /** @@ -917,20 +1067,24 @@ void kernfs_destroy_root(struct kernfs_root *root) * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory + * @uid: uid of the new directory + * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * - * Returns the created node on success, ERR_PTR() value on failure. + * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ - kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + kn = kernfs_new_node(parent, name, mode | S_IFDIR, + uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); @@ -952,7 +1106,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, * @parent: parent in which to create a new directory * @name: name of the new directory * - * Returns the created node on success, ERR_PTR() value on failure. + * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) @@ -961,7 +1115,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, int rc; /* allocate */ - kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR); + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); @@ -979,67 +1134,149 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, return ERR_PTR(rc); } +static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn, *parent; + struct kernfs_root *root; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Negative hashed dentry? */ + if (d_really_is_negative(dentry)) { + /* If the kernfs parent node has changed discard and + * proceed to ->lookup. + * + * There's nothing special needed here when getting the + * dentry parent, even if a concurrent rename is in + * progress. That's because the dentry is negative so + * it can only be the target of the rename and it will + * be doing a d_move() not a replace. Consequently the + * dentry d_parent won't change over the d_move(). + * + * Also kernfs negative dentries transitioning from + * negative to positive during revalidate won't happen + * because they are invalidated on containing directory + * changes and the lookup re-done so that a new positive + * dentry can be properly created. + */ + root = kernfs_root_from_sb(dentry->d_sb); + down_read(&root->kernfs_rwsem); + parent = kernfs_dentry_node(dentry->d_parent); + if (parent) { + if (kernfs_dir_changed(parent, dentry)) { + up_read(&root->kernfs_rwsem); + return 0; + } + } + up_read(&root->kernfs_rwsem); + + /* The kernfs parent node hasn't changed, leave the + * dentry negative and return success. + */ + return 1; + } + + kn = kernfs_dentry_node(dentry); + root = kernfs_root(kn); + down_read(&root->kernfs_rwsem); + + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) + goto out_bad; + + parent = kernfs_parent(kn); + /* The kernfs node has been moved? */ + if (kernfs_dentry_node(dentry->d_parent) != parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (parent && kernfs_ns_enabled(parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + up_read(&root->kernfs_rwsem); + return 1; +out_bad: + up_read(&root->kernfs_rwsem); + return 0; +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, +}; + static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; - struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; - struct inode *inode; + struct kernfs_root *root; + struct inode *inode = NULL; const void *ns = NULL; - mutex_lock(&kernfs_mutex); - + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); - - /* no such entry */ - if (!kn || !kernfs_active(kn)) { - ret = NULL; - goto out_unlock; - } - kernfs_get(kn); - dentry->d_fsdata = kn; - /* attach dentry and inode */ - inode = kernfs_get_inode(dir->i_sb, kn); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; + if (kn) { + /* Inactive nodes are invisible to the VFS so don't + * create a negative. + */ + if (!kernfs_active(kn)) { + up_read(&root->kernfs_rwsem); + return NULL; + } + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) + inode = ERR_PTR(-ENOMEM); } + /* + * Needed for negative dentry validation. + * The negative dentry can be created in kernfs_iop_lookup() + * or transforms from positive dentry in dentry_unlink_inode() + * called from vfs_rmdir(). + */ + if (!IS_ERR(inode)) + kernfs_set_rev(parent, dentry); + up_read(&root->kernfs_rwsem); - /* instantiate and hash dentry */ - ret = d_splice_alias(inode, dentry); - out_unlock: - mutex_unlock(&kernfs_mutex); - return ret; + /* instantiate and hash (possibly negative) dentry */ + return d_splice_alias(inode, dentry); } -static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode) +static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) - return -EPERM; + return ERR_PTR(-EPERM); if (!kernfs_get_active(parent)) - return -ENODEV; + return ERR_PTR(-ENODEV); ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); - return ret; + return ERR_PTR(ret); } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1055,11 +1292,12 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, +static int kernfs_iop_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1127,13 +1365,15 @@ static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. + * + * Return: the next descendant to visit or %NULL when done. */ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -1149,7 +1389,22 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ - return pos->parent; + return kernfs_parent(pos); +} + +static void kernfs_activate_one(struct kernfs_node *kn) +{ + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); + + kn->flags |= KERNFS_ACTIVATED; + + if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) + return; + + WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb)); + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** @@ -1168,84 +1423,110 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; + struct kernfs_root *root = kernfs_root(kn); - mutex_lock(&kernfs_mutex); + down_write(&root->kernfs_rwsem); pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (!pos || (pos->flags & KERNFS_ACTIVATED)) - continue; + while ((pos = kernfs_next_descendant_post(pos, kn))) + kernfs_activate_one(pos); + + up_write(&root->kernfs_rwsem); +} + +/** + * kernfs_show - show or hide a node + * @kn: kernfs_node to show or hide + * @show: whether to show or hide + * + * If @show is %false, @kn is marked hidden and deactivated. A hidden node is + * ignored in future activaitons. If %true, the mark is removed and activation + * state is restored. This function won't implicitly activate a new node in a + * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. + * + * To avoid recursion complexities, directories aren't supported for now. + */ +void kernfs_show(struct kernfs_node *kn, bool show) +{ + struct kernfs_root *root = kernfs_root(kn); - WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); - WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) + return; - atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); - pos->flags |= KERNFS_ACTIVATED; + down_write(&root->kernfs_rwsem); + + if (show) { + kn->flags &= ~KERNFS_HIDDEN; + if (kn->flags & KERNFS_ACTIVATED) + kernfs_activate_one(kn); + } else { + kn->flags |= KERNFS_HIDDEN; + if (kernfs_active(kn)) + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + kernfs_drain(kn); } - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { - struct kernfs_node *pos; + struct kernfs_node *pos, *parent; - lockdep_assert_held(&kernfs_mutex); + /* Short-circuit if non-root @kn has already finished removal. */ + if (!kn) + return; + + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* - * Short-circuit if non-root @kn has already finished removal. * This is for kernfs_remove_self() which plays with active ref * after removal. */ - if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb))) + if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb)) return; - pr_debug("kernfs %s: removing\n", kn->name); + pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn)); - /* prevent any new usage under @kn by deactivating all nodes */ + /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) + while ((pos = kernfs_next_descendant_post(pos, kn))) { + pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* - * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); - /* - * Drain iff @kn was activated. This avoids draining and - * its lockdep annotations for nodes which have never been - * activated and allows embedding kernfs_remove() in create - * error paths without worrying about draining. - */ - if (kn->flags & KERNFS_ACTIVATED) - kernfs_drain(pos); - else - WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); - + kernfs_drain(pos); + parent = kernfs_parent(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ - if (!pos->parent || kernfs_unlink_sibling(pos)) { + if (!parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = - pos->parent ? pos->parent->iattr : NULL; + parent ? parent->iattr : NULL; /* update timestamps on the parent */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + if (ps_iattr) { - ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime); - ps_iattr->ia_iattr.ia_mtime = - ps_iattr->ia_iattr.ia_ctime; + ktime_get_real_ts64(&ps_iattr->ia_ctime); + ps_iattr->ia_mtime = ps_iattr->ia_ctime; } + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } @@ -1261,9 +1542,16 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - mutex_lock(&kernfs_mutex); + struct kernfs_root *root; + + if (!kn) + return; + + root = kernfs_root(kn); + + down_write(&root->kernfs_rwsem); __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_rwsem); } /** @@ -1297,8 +1585,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn) * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of - * being removed. Once kernfs_break_active_protection() is invoked, that - * protection is irreversibly gone for the kernfs operation instance. + * being drained and removed. Once kernfs_break_active_protection() is + * invoked, that protection is irreversibly gone for the kernfs operation + * instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location @@ -1345,22 +1634,25 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn) * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. + * + * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; + struct kernfs_root *root = kernfs_root(kn); - mutex_lock(&kernfs_mutex); + down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored - * while holding kernfs_mutex. The ones which lost arbitration - * waits for SUICDED && drained which can happen only after the - * enclosing kernfs operation which executed the winning instance - * of kernfs_remove_self() finished. + * while kernfs_rwsem for held exclusive. The ones which lost + * arbitration waits for SUICIDED && drained which can happen only + * after the enclosing kernfs operation which executed the winning + * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; @@ -1378,9 +1670,9 @@ bool kernfs_remove_self(struct kernfs_node *kn) atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_rwsem); schedule(); - mutex_lock(&kernfs_mutex); + down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); @@ -1388,12 +1680,12 @@ bool kernfs_remove_self(struct kernfs_node *kn) } /* - * This must be done while holding kernfs_mutex; otherwise, waiting - * for SUICIDED && deactivated could finish prematurely. + * This must be done while kernfs_rwsem held exclusive; otherwise, + * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_rwsem); return ret; } @@ -1404,12 +1696,14 @@ bool kernfs_remove_self(struct kernfs_node *kn) * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. - * Returns 0 on success, -ENOENT if such entry doesn't exist. + * + * Return: %0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", @@ -1417,13 +1711,17 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - mutex_lock(&kernfs_mutex); + root = kernfs_root(parent); + down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); - if (kn) + if (kn) { + kernfs_get(kn); __kernfs_remove(kn); + kernfs_put(kn); + } - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_rwsem); if (kn) return 0; @@ -1437,28 +1735,42 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag + * + * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; - const char *old_name = NULL; + struct kernfs_root *root; + const char *old_name; int error; /* can't move or rename root */ - if (!kn->parent) + if (!rcu_access_pointer(kn->__parent)) return -EINVAL; - mutex_lock(&kernfs_mutex); + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; + old_parent = kernfs_parent(kn); + if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) { + error = -EINVAL; + if (WARN_ON_ONCE(old_parent != new_parent)) + goto out; + } + error = 0; - if ((kn->parent == new_parent) && (kn->ns == new_ns) && - (strcmp(kn->name, new_name) == 0)) + old_name = kernfs_rcu_name(kn); + if (!new_name) + new_name = old_name; + if ((old_parent == new_parent) && (kn->ns == new_ns) && + (strcmp(old_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; @@ -1466,7 +1778,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, goto out; /* rename kernfs_node */ - if (strcmp(kn->name, new_name) != 0) { + if (strcmp(old_name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) @@ -1479,40 +1791,39 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, * Move to the appropriate place in the appropriate directories rbtree. */ kernfs_unlink_sibling(kn); - kernfs_get(new_parent); - /* rename_lock protects ->parent and ->name accessors */ - spin_lock_irq(&kernfs_rename_lock); + /* rename_lock protects ->parent accessors */ + if (old_parent != new_parent) { + kernfs_get(new_parent); + write_lock_irq(&root->kernfs_rename_lock); - old_parent = kn->parent; - kn->parent = new_parent; + rcu_assign_pointer(kn->__parent, new_parent); - kn->ns = new_ns; - if (new_name) { - old_name = kn->name; - kn->name = new_name; - } + kn->ns = new_ns; + if (new_name) + rcu_assign_pointer(kn->name, new_name); - spin_unlock_irq(&kernfs_rename_lock); + write_unlock_irq(&root->kernfs_rename_lock); + kernfs_put(old_parent); + } else { + /* name assignment is RCU protected, parent is the same */ + kn->ns = new_ns; + if (new_name) + rcu_assign_pointer(kn->name, new_name); + } - kn->hash = kernfs_name_hash(kn->name, kn->ns); + kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns); kernfs_link_sibling(kn); - kernfs_put(old_parent); - kfree_const(old_name); + if (new_name && !is_kernel_rodata((unsigned long)old_name)) + kfree_rcu_mightsleep(old_name); error = 0; out: - mutex_unlock(&kernfs_mutex); + up_write(&root->kernfs_rwsem); return error; } -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct kernfs_node *kn) -{ - return (kn->mode >> 12) & 15; -} - static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); @@ -1524,7 +1835,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, { if (pos) { int valid = kernfs_active(pos) && - pos->parent == parent && hash == pos->hash; + rcu_access_pointer(pos->__parent) == parent && + hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; @@ -1572,13 +1884,16 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns, static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; + struct kernfs_root *root; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; - mutex_lock(&kernfs_mutex); + + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -1586,21 +1901,21 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { - const char *name = pos->name; - unsigned int type = dt_type(pos); + const char *name = kernfs_rcu_name(pos); + unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); - ino_t ino = pos->ino; + ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); - mutex_unlock(&kernfs_mutex); - if (!dir_emit(ctx, name, len, ino, type)) + if (!dir_emit(ctx, name, len, ino, type)) { + up_read(&root->kernfs_rwsem); return 0; - mutex_lock(&kernfs_mutex); + } } - mutex_unlock(&kernfs_mutex); + up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; |
