diff options
Diffstat (limited to 'fs/dcache.c')
| -rw-r--r-- | fs/dcache.c | 4026 |
1 files changed, 2131 insertions, 1895 deletions
diff --git a/fs/dcache.c b/fs/dcache.c index 87bdb5329c3c..dc2fff4811d1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * @@ -14,41 +15,37 @@ * the dcache entry is deleted or garbage collected. */ -#include <linux/syscalls.h> +#include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> -#include <linux/mount.h> -#include <linux/file.h> -#include <asm/uaccess.h> #include <linux/security.h> #include <linux/seqlock.h> -#include <linux/swap.h> -#include <linux/bootmem.h> -#include <linux/fs_struct.h> -#include <linux/hardirq.h> +#include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> -#include <linux/prefetch.h> -#include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" #include "mount.h" +#include <asm/runtime-const.h> + /* * Usage: * dcache->d_inode->i_lock protects: - * - i_dentry, d_alias, d_inode of aliases + * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table - * s_anon bl list spinlock protects: - * - the s_anon list (see __d_drop) - * dcache_lru_lock protects: + * s_roots bl list spinlock protects: + * - the s_roots list (see __d_drop) + * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags @@ -56,16 +53,16 @@ * - d_lru * - d_count * - d_unhashed() - * - d_parent and d_subdirs - * - childrens' d_child and d_parent - * - d_alias, d_inode + * - d_parent and d_chilren + * - childrens' d_sib and d_parent + * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock - * dcache_lru_lock + * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock - * s_anon lock + * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock @@ -74,19 +71,30 @@ * dentry->d_lock * * If no ancestor relationship: - * if (dentry1 < dentry2) - * dentry1->d_lock - * dentry2->d_lock + * arbitrary, since it's serialized on rename_lock */ -int sysctl_vfs_cache_pressure __read_mostly = 100; -EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); +static int sysctl_vfs_cache_pressure __read_mostly = 100; +static int sysctl_vfs_cache_pressure_denom __read_mostly = 100; + +unsigned long vfs_pressure_ratio(unsigned long val) +{ + return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom); +} +EXPORT_SYMBOL_GPL(vfs_pressure_ratio); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __read_mostly; +static struct kmem_cache *__dentry_cache __ro_after_init; +#define dentry_cache runtime_const_ptr(__dentry_cache) + +const struct qstr empty_name = QSTR_INIT("", 0); +EXPORT_SYMBOL(empty_name); +const struct qstr slash_name = QSTR_INIT("/", 1); +EXPORT_SYMBOL(slash_name); +const struct qstr dotdot_name = QSTR_INIT("..", 2); +EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes @@ -95,46 +103,147 @@ static struct kmem_cache *dentry_cache __read_mostly; * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. + * + * Marking the variables "used" ensures that the compiler doesn't + * optimize them away completely on architectures with runtime + * constant infrastructure, this allows debuggers to see their + * values. But updating these values has no effect on those arches. */ -#define D_HASHBITS d_hash_shift -#define D_HASHMASK d_hash_mask -static unsigned int d_hash_mask __read_mostly; -static unsigned int d_hash_shift __read_mostly; +static unsigned int d_hash_shift __ro_after_init __used; + +static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; + +static inline struct hlist_bl_head *d_hash(unsigned long hashlen) +{ + return runtime_const_ptr(dentry_hashtable) + + runtime_const_shift_right_32(hashlen, d_hash_shift); +} -static struct hlist_bl_head *dentry_hashtable __read_mostly; +#define IN_LOOKUP_SHIFT 10 +static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; -static inline struct hlist_bl_head *d_hash(const struct dentry *parent, +static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> D_HASHBITS); - return dentry_hashtable + (hash & D_HASHMASK); + return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } -/* Statistics gathering. */ -struct dentry_stat_t dentry_stat = { - .age_limit = 45, +struct dentry_stat_t { + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long nr_negative; /* # of unused negative dentries */ + long dummy; /* Reserved for future use */ }; -static DEFINE_PER_CPU(unsigned int, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry_unused); +static DEFINE_PER_CPU(long, nr_dentry_negative); +static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -static int get_nr_dentry(void) +/* Statistics gathering. */ +static struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; + +/* + * Here we resort to our own counters instead of using generic per-cpu counters + * for consistency with what the vfs inode code does. We are expected to harvest + * better code and performance by having our own specialized counters. + * + * Please note that the loop is done over all possible CPUs, not over all online + * CPUs. The reason for this is that we don't want to play games with CPUs going + * on and off. If one of them goes off, we will just keep their counters. + * + * glommer: See cffbc8a for details, and if you ever intend to change this, + * please update all vfs counters to match. + */ +static long get_nr_dentry(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } -int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static long get_nr_dentry_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_unused, i); + return sum < 0 ? 0 : sum; +} + +static long get_nr_dentry_negative(void) +{ + int i; + long sum = 0; + + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_negative, i); + return sum < 0 ? 0 : sum; +} + +static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); - return proc_dointvec(table, write, buffer, lenp, ppos); + dentry_stat.nr_unused = get_nr_dentry_unused(); + dentry_stat.nr_negative = get_nr_dentry_negative(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} + +static const struct ctl_table fs_dcache_sysctls[] = { + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { + .procname = "dentry-negative", + .data = &dentry_negative_policy, + .maxlen = sizeof(dentry_negative_policy), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static const struct ctl_table vm_dcache_sysctls[] = { + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "vfs_cache_pressure_denom", + .data = &sysctl_vfs_cache_pressure_denom, + .maxlen = sizeof(sysctl_vfs_cache_pressure_denom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + }, +}; + +static int __init init_fs_dcache_sysctls(void) +{ + register_sysctl_init("vm", vm_dcache_sysctls); + register_sysctl_init("fs", fs_dcache_sysctls); + return 0; } +fs_initcall(init_fs_dcache_sysctls); #endif /* @@ -158,7 +267,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char unsigned long a,b,mask; for (;;) { - a = *(unsigned long *)cs; + a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; @@ -170,7 +279,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char if (!tcount) return 0; } - mask = ~(~0ul << tcount*8); + mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } @@ -192,10 +301,9 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { - const unsigned char *cs; /* * Be careful about RCU walk racing with rename: - * use ACCESS_ONCE to fetch the name pointer. + * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The @@ -209,90 +317,146 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - cs = ACCESS_ONCE(dentry->d_name.name); - smp_read_barrier_depends(); + const unsigned char *cs = READ_ONCE(dentry->d_name.name); + return dentry_string_cmp(cs, ct, tcount); } +/* + * long names are allocated separately from dentry and never modified. + * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() + * for the reason why ->count and ->head can't be combined into a union. + * dentry_string_cmp() relies upon ->name[] being word-aligned. + */ +struct external_name { + atomic_t count; + struct rcu_head head; + unsigned char name[] __aligned(sizeof(unsigned long)); +}; + +static inline struct external_name *external_name(struct dentry *dentry) +{ + return container_of(dentry->d_name.name, struct external_name, name[0]); +} + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - WARN_ON(!hlist_unhashed(&dentry->d_alias)); - if (dname_external(dentry)) - kfree(dentry->d_name.name); kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) +static void __d_free_external(struct rcu_head *head) { - BUG_ON(dentry->d_count); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - - /* if dentry was never visible to RCU, immediate free is OK */ - if (!(dentry->d_flags & DCACHE_RCUACCESS)) - __d_free(&dentry->d_u.d_rcu); - else - call_rcu(&dentry->d_u.d_rcu, __d_free); + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + kfree(external_name(dentry)); + kmem_cache_free(dentry_cache, dentry); } -/** - * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups - * @dentry: the target dentry - * After this call, in-progress rcu-walk path lookup will fail. This - * should be called after unhashing, and after changing d_inode (if - * the dentry has not already been unhashed). - */ -static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +static inline int dname_external(const struct dentry *dentry) { - assert_spin_locked(&dentry->d_lock); - /* Go through a barrier */ - write_seqcount_barrier(&dentry->d_seq); + return dentry->d_name.name != dentry->d_shortname.string; } -/* - * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. Dentry has no refcount - * and is unhashed. - */ -static void dentry_iput(struct dentry * dentry) - __releases(dentry->d_lock) - __releases(dentry->d_inode->i_lock) +void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; - if (inode) { - dentry->d_inode = NULL; - hlist_del_init(&dentry->d_alias); - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - if (!inode->i_nlink) - fsnotify_inoderemove(inode); - if (dentry->d_op && dentry->d_op->d_iput) - dentry->d_op->d_iput(dentry, inode); - else - iput(inode); + unsigned seq; + const unsigned char *s; + + rcu_read_lock(); +retry: + seq = read_seqcount_begin(&dentry->d_seq); + s = READ_ONCE(dentry->d_name.name); + name->name.hash_len = dentry->d_name.hash_len; + name->name.name = name->inline_name.string; + if (likely(s == dentry->d_shortname.string)) { + name->inline_name = dentry->d_shortname; } else { - spin_unlock(&dentry->d_lock); + struct external_name *p; + p = container_of(s, struct external_name, name[0]); + // get a valid reference + if (unlikely(!atomic_inc_not_zero(&p->count))) + goto retry; + name->name.name = s; + } + if (read_seqcount_retry(&dentry->d_seq, seq)) { + release_dentry_name_snapshot(name); + goto retry; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(take_dentry_name_snapshot); + +void release_dentry_name_snapshot(struct name_snapshot *name) +{ + if (unlikely(name->name.name != name->inline_name.string)) { + struct external_name *p; + p = container_of(name->name.name, struct external_name, name[0]); + if (unlikely(atomic_dec_and_test(&p->count))) + kfree_rcu(p, head); + } +} +EXPORT_SYMBOL(release_dentry_name_snapshot); + +static inline void __d_set_inode_and_type(struct dentry *dentry, + struct inode *inode, + unsigned type_flags) +{ + unsigned flags; + + dentry->d_inode = inode; + flags = READ_ONCE(dentry->d_flags); + flags &= ~DCACHE_ENTRY_TYPE; + flags |= type_flags; + smp_store_release(&dentry->d_flags, flags); +} + +static inline void __d_clear_type_and_inode(struct dentry *dentry) +{ + unsigned flags = READ_ONCE(dentry->d_flags); + + flags &= ~DCACHE_ENTRY_TYPE; + WRITE_ONCE(dentry->d_flags, flags); + dentry->d_inode = NULL; + /* + * The negative counter only tracks dentries on the LRU. Don't inc if + * d_lru is on another list. + */ + if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) + this_cpu_inc(nr_dentry_negative); +} + +static void dentry_free(struct dentry *dentry) +{ + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + if (likely(atomic_dec_and_test(&p->count))) { + call_rcu(&dentry->d_u.d_rcu, __d_free_external); + return; + } } + /* if dentry was never visible to RCU, immediate free is OK */ + if (dentry->d_flags & DCACHE_NORCU) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. dentry remains in-use. + * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; - dentry->d_inode = NULL; - hlist_del_init(&dentry->d_alias); - dentry_rcuwalk_barrier(dentry); + + raw_write_seqcount_begin(&dentry->d_seq); + __d_clear_type_and_inode(dentry); + hlist_del_init(&dentry->d_u.d_alias); + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -304,106 +468,116 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. + * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry + * is in use - which includes both the "real" per-superblock + * LRU list _and_ the DCACHE_SHRINK_LIST use. + * + * The DCACHE_SHRINK_LIST bit is set whenever the dentry is + * on the shrink list (ie not on the superblock LRU list). + * + * The per-cpu "nr_dentry_unused" counters are updated with + * the DCACHE_LRU_LIST bit. + * + * The per-cpu "nr_dentry_negative" counters are only updated + * when deleted from or added to the per-superblock LRU list, not + * from/to the shrink list. That is to avoid an unneeded dec/inc + * pair when moving from LRU to shrink list in select_collect(). + * + * These helper functions make sure we always follow the + * rules. d_lock must be held by the caller. */ -static void dentry_lru_add(struct dentry *dentry) +#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) +static void d_lru_add(struct dentry *dentry) { - if (list_empty(&dentry->d_lru)) { - spin_lock(&dcache_lru_lock); - list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - spin_unlock(&dcache_lru_lock); - } + D_FLAG_VERIFY(dentry, 0); + dentry->d_flags |= DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_inc(nr_dentry_negative); + WARN_ON_ONCE(!list_lru_add_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } -static void __dentry_lru_del(struct dentry *dentry) +static void d_lru_del(struct dentry *dentry) { + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); + WARN_ON_ONCE(!list_lru_del_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_shrink_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); - dentry->d_flags &= ~DCACHE_SHRINK_LIST; - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + this_cpu_dec(nr_dentry_unused); +} + +static void d_shrink_add(struct dentry *dentry, struct list_head *list) +{ + D_FLAG_VERIFY(dentry, 0); + list_add(&dentry->d_lru, list); + dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); } /* - * Remove a dentry with references from the LRU. + * These can only be called under the global LRU lock, ie during the + * callback for freeing the LRU list. "isolate" removes it from the + * LRU lists entirely, while shrink_move moves it to the indicated + * private list. */ -static void dentry_lru_del(struct dentry *dentry) +static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { - if (!list_empty(&dentry->d_lru)) { - spin_lock(&dcache_lru_lock); - __dentry_lru_del(dentry); - spin_unlock(&dcache_lru_lock); - } + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); + list_lru_isolate(lru, &dentry->d_lru); } -static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) +static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, + struct list_head *list) { - spin_lock(&dcache_lru_lock); - if (list_empty(&dentry->d_lru)) { - list_add_tail(&dentry->d_lru, list); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - } else { - list_move_tail(&dentry->d_lru, list); - } - spin_unlock(&dcache_lru_lock); + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags |= DCACHE_SHRINK_LIST; + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); + list_lru_isolate_move(lru, &dentry->d_lru, list); } -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) +static void ___d_drop(struct dentry *dentry) { - list_del(&dentry->d_u.d_child); + struct hlist_bl_head *b; /* - * Inform try_to_ascend() that we are no longer attached to the - * dentry tree + * Hashed dentries are normally on the dentry hashtable, + * with the exception of those newly allocated by + * d_obtain_root, which are always IS_ROOT: */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; + if (unlikely(IS_ROOT(dentry))) + b = &dentry->d_sb->s_roots; + else + b = d_hash(dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + hlist_bl_unlock(b); } -/* - * Unhash a dentry without inserting an RCU walk barrier or checking that - * dentry->d_lock is locked. The caller must take care of that, if - * appropriate. - */ -static void __d_shrink(struct dentry *dentry) +void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { - struct hlist_bl_head *b; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) - b = &dentry->d_sb->s_anon; - else - b = d_hash(dentry->d_parent, dentry->d_name.hash); - - hlist_bl_lock(b); - __hlist_bl_del(&dentry->d_hash); + ___d_drop(dentry); dentry->d_hash.pprev = NULL; - hlist_bl_unlock(b); + write_seqcount_invalidate(&dentry->d_seq); } } +EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry @@ -418,17 +592,11 @@ static void __d_shrink(struct dentry *dentry) * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * - * __d_drop requires dentry->d_lock. + * __d_drop requires dentry->d_lock + * + * ___d_drop doesn't mark dentry as "unhashed" + * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ -void __d_drop(struct dentry *dentry) -{ - if (!d_unhashed(dentry)) { - __d_shrink(dentry); - dentry_rcuwalk_barrier(dentry); - } -} -EXPORT_SYMBOL(__d_drop); - void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); @@ -437,37 +605,54 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * If ref is non-zero, then decrement the refcount too. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) - __releases(dentry->d_lock) +static inline void dentry_unlist(struct dentry *dentry) { - struct inode *inode; - struct dentry *parent; - - inode = dentry->d_inode; - if (inode && !spin_trylock(&inode->i_lock)) { -relock: - spin_unlock(&dentry->d_lock); - cpu_relax(); - return dentry; /* try again with same dentry */ - } - if (IS_ROOT(dentry)) - parent = NULL; - else - parent = dentry->d_parent; - if (parent && !spin_trylock(&parent->d_lock)) { - if (inode) - spin_unlock(&inode->i_lock); - goto relock; + struct dentry *next; + /* + * Inform d_walk() and shrink_dentry_list() that we are no longer + * attached to the dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (unlikely(hlist_unhashed(&dentry->d_sib))) + return; + __hlist_del(&dentry->d_sib); + /* + * Cursors can move around the list of children. While we'd been + * a normal list member, it didn't matter - ->d_sib.next would've + * been updated. However, from now on it won't be and for the + * things like d_walk() it might end up with a nasty surprise. + * Normally d_walk() doesn't care about cursors moving around - + * ->d_lock on parent prevents that and since a cursor has no children + * of its own, we get through it without ever unlocking the parent. + * There is one exception, though - if we ascend from a child that + * gets killed as soon as we unlock it, the next sibling is found + * using the value left in its ->d_sib.next. And if _that_ + * pointed to a cursor, and cursor got moved (e.g. by lseek()) + * before d_walk() regains parent->d_lock, we'll end up skipping + * everything the cursor had been moved past. + * + * Solution: make sure that the pointer left behind in ->d_sib.next + * points to something that won't be moving around. I.e. skip the + * cursors. + */ + while (dentry->d_sib.next) { + next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); + if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) + break; + dentry->d_sib.next = next->d_sib.next; } +} + +static struct dentry *__dentry_kill(struct dentry *dentry) +{ + struct dentry *parent = NULL; + bool can_free = true; + + /* + * The dentry is now unrecoverably dead to the world. + */ + lockref_mark_dead(&dentry->d_lockref); - if (ref) - dentry->d_count--; /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. @@ -475,10 +660,233 @@ relock: if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + if (dentry->d_inode) + dentry_unlink_inode(dentry); + else + spin_unlock(&dentry->d_lock); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + cond_resched(); + /* now that it's negative, ->d_parent is stable */ + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry_unlist(dentry); + if (dentry->d_flags & DCACHE_SHRINK_LIST) + can_free = false; + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); + if (parent && --parent->d_lockref.count) { + spin_unlock(&parent->d_lock); + return NULL; + } + return parent; +} + +/* + * Lock a dentry for feeding it to __dentry_kill(). + * Called under rcu_read_lock() and dentry->d_lock; the former + * guarantees that nothing we access will be freed under us. + * Note that dentry is *not* protected from concurrent dentry_kill(), + * d_delete(), etc. + * + * Return false if dentry is busy. Otherwise, return true and have + * that dentry's inode locked. + */ + +static bool lock_for_kill(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (unlikely(dentry->d_lockref.count)) + return false; + + if (!inode || likely(spin_trylock(&inode->i_lock))) + return true; + + do { + spin_unlock(&dentry->d_lock); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + if (likely(inode == dentry->d_inode)) + break; + spin_unlock(&inode->i_lock); + inode = dentry->d_inode; + } while (inode); + if (likely(!dentry->d_lockref.count)) + return true; + if (inode) + spin_unlock(&inode->i_lock); + return false; +} + +/* + * Decide if dentry is worth retaining. Usually this is called with dentry + * locked; if not locked, we are more limited and might not be able to tell + * without a lock. False in this case means "punt to locked path and recheck". + * + * In case we aren't locked, these predicates are not "stable". However, it is + * sufficient that at some point after we dropped the reference the dentry was + * hashed and the flags had the proper value. Other dentry users may have + * re-gotten a reference to the dentry and change that, but our work is done - + * we can leave the dentry around with a zero refcount. + */ +static inline bool retain_dentry(struct dentry *dentry, bool locked) +{ + unsigned int d_flags; + + smp_rmb(); + d_flags = READ_ONCE(dentry->d_flags); + + // Unreachable? Nobody would be able to look it up, no point retaining + if (unlikely(d_unhashed(dentry))) + return false; + + // Same if it's disconnected + if (unlikely(d_flags & DCACHE_DISCONNECTED)) + return false; + + // ->d_delete() might tell us not to bother, but that requires + // ->d_lock; can't decide without it + if (unlikely(d_flags & DCACHE_OP_DELETE)) { + if (!locked || dentry->d_op->d_delete(dentry)) + return false; + } + + // Explicitly told not to bother + if (unlikely(d_flags & DCACHE_DONTCACHE)) + return false; + + // At this point it looks like we ought to keep it. We also might + // need to do something - put it on LRU if it wasn't there already + // and mark it referenced if it was on LRU, but not marked yet. + // Unfortunately, both actions require ->d_lock, so in lockless + // case we'd have to punt rather than doing those. + if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { + if (!locked) + return false; + d_lru_add(dentry); + } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { + if (!locked) + return false; + dentry->d_flags |= DCACHE_REFERENCED; + } + return true; +} + +void d_mark_dontcache(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { + spin_lock(&de->d_lock); + de->d_flags |= DCACHE_DONTCACHE; + spin_unlock(&de->d_lock); + } + inode_state_set(inode, I_DONTCACHE); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_mark_dontcache); + +/* + * Try to do a lockless dput(), and return whether that was successful. + * + * If unsuccessful, we return false, having already taken the dentry lock. + * In that case refcount is guaranteed to be zero and we have already + * decided that it's not worth keeping around. + * + * The caller needs to hold the RCU read lock, so that the dentry is + * guaranteed to stay around even if the refcount goes down to zero! + */ +static inline bool fast_dput(struct dentry *dentry) +{ + int ret; + + /* + * try to decrement the lockref optimistically. + */ + ret = lockref_put_return(&dentry->d_lockref); + + /* + * If the lockref_put_return() failed due to the lock being held + * by somebody else, the fast path has failed. We will need to + * get the lock, and then check the count again. + */ + if (unlikely(ret < 0)) { + spin_lock(&dentry->d_lock); + if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { + spin_unlock(&dentry->d_lock); + return true; + } + dentry->d_lockref.count--; + goto locked; + } + + /* + * If we weren't the last ref, we're done. + */ + if (ret) + return true; + + /* + * Can we decide that decrement of refcount is all we needed without + * taking the lock? There's a very common case when it's all we need - + * dentry looks like it ought to be retained and there's nothing else + * to do. + */ + if (retain_dentry(dentry, false)) + return true; + + /* + * Either not worth retaining or we can't tell without the lock. + * Get the lock, then. We've already decremented the refcount to 0, + * but we'll need to re-check the situation after getting the lock. + */ + spin_lock(&dentry->d_lock); + + /* + * Did somebody else grab a reference to it in the meantime, and + * we're no longer the last user after all? Alternatively, somebody + * else could have killed it and marked it dead. Either way, we + * don't need to do anything else. + */ +locked: + if (dentry->d_lockref.count || retain_dentry(dentry, true)) { + spin_unlock(&dentry->d_lock); + return true; + } + return false; +} + +static void finish_dput(struct dentry *dentry) + __releases(dentry->d_lock) + __releases(RCU) +{ + while (lock_for_kill(dentry)) { + rcu_read_unlock(); + dentry = __dentry_kill(dentry); + if (!dentry) + return; + if (retain_dentry(dentry, true)) { + spin_unlock(&dentry->d_lock); + return; + } + rcu_read_lock(); + } + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); } /* @@ -511,114 +919,69 @@ void dput(struct dentry *dentry) { if (!dentry) return; - -repeat: - if (dentry->d_count == 1) - might_sleep(); - spin_lock(&dentry->d_lock); - BUG_ON(!dentry->d_count); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + might_sleep(); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); return; } - - if (dentry->d_flags & DCACHE_OP_DELETE) { - if (dentry->d_op->d_delete(dentry)) - goto kill_it; - } - - /* Unreachable? Get rid of it */ - if (d_unhashed(dentry)) - goto kill_it; - - dentry->d_flags |= DCACHE_REFERENCED; - dentry_lru_add(dentry); - - dentry->d_count--; - spin_unlock(&dentry->d_lock); - return; - -kill_it: - dentry = dentry_kill(dentry, 1); - if (dentry) - goto repeat; + finish_dput(dentry); } EXPORT_SYMBOL(dput); -/** - * d_invalidate - invalidate a dentry - * @dentry: dentry to invalidate - * - * Try to invalidate the dentry if it turns out to be - * possible. If there are other dentries that can be - * reached through this one we can't delete it and we - * return -EBUSY. On success we return 0. - * - * no dcache lock. - */ - -int d_invalidate(struct dentry * dentry) +void d_make_discardable(struct dentry *dentry) { - /* - * If it's already been dropped, return OK. - */ spin_lock(&dentry->d_lock); - if (d_unhashed(dentry)) { - spin_unlock(&dentry->d_lock); - return 0; - } - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. - */ - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); - shrink_dcache_parent(dentry); - spin_lock(&dentry->d_lock); - } - - /* - * Somebody else still using it? - * - * If it's a directory, we can't drop it - * for fear of somebody re-populating it - * with children (even though dropping it - * would make it unreachable from the root, - * we might still populate it if it was a - * working directory or similar). - * We also need to leave mountpoints alone, - * directory or not. - */ - if (dentry->d_count > 1 && dentry->d_inode) { - if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { - spin_unlock(&dentry->d_lock); - return -EBUSY; - } - } - - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - return 0; + WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT)); + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + rcu_read_lock(); + finish_dput(dentry); } -EXPORT_SYMBOL(d_invalidate); +EXPORT_SYMBOL(d_make_discardable); -/* This must be called with d_lock held */ -static inline void __dget_dlock(struct dentry *dentry) +static void to_shrink_list(struct dentry *dentry, struct list_head *list) +__must_hold(&dentry->d_lock) { - dentry->d_count++; + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + d_shrink_add(dentry, list); + } } -static inline void __dget(struct dentry *dentry) +void dput_to_list(struct dentry *dentry, struct list_head *list) { - spin_lock(&dentry->d_lock); - __dget_dlock(dentry); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { + int gotref; struct dentry *ret; + unsigned seq; + + /* + * Do optimistic parent lookup without any + * locking. + */ + rcu_read_lock(); + seq = raw_seqcount_begin(&dentry->d_seq); + ret = READ_ONCE(dentry->d_parent); + gotref = lockref_get_not_zero(&ret->d_lockref); + rcu_read_unlock(); + if (likely(gotref)) { + if (!read_seqcount_retry(&dentry->d_seq, seq)) + return ret; + dput(ret); + } repeat: /* @@ -634,73 +997,82 @@ repeat: goto repeat; } rcu_read_unlock(); - BUG_ON(!ret->d_count); - ret->d_count++; + BUG_ON(!ret->d_lockref.count); + ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + lockref_get(&alias->d_lockref); + return alias; +} + /** - * d_find_alias - grab a hashed alias of inode - * @inode: inode in question - * @want_discon: flag, used by d_splice_alias, to request - * that only a DISCONNECTED alias be returned. - * - * If inode has a hashed alias, or is a directory and has any alias, - * acquire the reference to alias and return it. Otherwise return NULL. - * Notice that if inode is a directory there can be only one alias and - * it can be unhashed only if it has no children, or if it is the root - * of a filesystem. + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for * - * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one unless @want_discon is set, - * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. */ -static struct dentry *__d_find_alias(struct inode *inode, int want_discon) +struct dentry *d_find_any_alias(struct inode *inode) { - struct dentry *alias, *discon_alias; + struct dentry *de; -again: - discon_alias = NULL; - hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { - spin_lock(&alias->d_lock); - if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) { - discon_alias = alias; - } else if (!want_discon) { - __dget_dlock(alias); - spin_unlock(&alias->d_lock); - return alias; - } - } - spin_unlock(&alias->d_lock); - } - if (discon_alias) { - alias = discon_alias; + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} +EXPORT_SYMBOL(d_find_any_alias); + +static struct dentry *__d_find_alias(struct inode *inode) +{ + struct dentry *alias; + + if (S_ISDIR(inode->i_mode)) + return __d_find_any_alias(inode); + + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); - if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) { - __dget_dlock(alias); - spin_unlock(&alias->d_lock); - return alias; - } + if (!d_unhashed(alias)) { + dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; } spin_unlock(&alias->d_lock); - goto again; } return NULL; } +/** + * d_find_alias - grab a hashed alias of inode + * @inode: inode in question + * + * If inode has a hashed alias, or is a directory and has any alias, + * acquire the reference to alias and return it. Otherwise return NULL. + * Notice that if inode is a directory there can be only one alias and + * it can be unhashed only if it has no children, or if it is the root + * of a filesystem, or if the directory was renamed and d_revalidate + * was the first vfs operation to notice. + * + * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer + * any other hashed alias over that one. + */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode, 0); + de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; @@ -708,157 +1080,194 @@ struct dentry *d_find_alias(struct inode *inode) EXPORT_SYMBOL(d_find_alias); /* + * Caller MUST be holding rcu_read_lock() and be guaranteed + * that inode won't get freed until rcu_read_unlock(). + */ +struct dentry *d_find_alias_rcu(struct inode *inode) +{ + struct hlist_head *l = &inode->i_dentry; + struct dentry *de = NULL; + + spin_lock(&inode->i_lock); + // ->i_dentry and ->i_rcu are colocated, but the latter won't be + // used without having I_FREEING set, which means no aliases left + if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { + if (S_ISDIR(inode->i_mode)) { + de = hlist_entry(l->first, struct dentry, d_u.d_alias); + } else { + hlist_for_each_entry(de, l, d_u.d_alias) + if (!d_unhashed(de)) + break; + } + } + spin_unlock(&inode->i_lock); + return de; +} + +void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) +{ + spin_lock(&dentry->d_lock); + if (!dentry->d_lockref.count) + to_shrink_list(dentry, dispose); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_dispose_if_unused); + +/* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { + LIST_HEAD(dispose); struct dentry *dentry; -restart: + spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { - spin_lock(&dentry->d_lock); - if (!dentry->d_count) { - __dget_dlock(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - dput(dentry); - goto restart; - } - spin_unlock(&dentry->d_lock); - } + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static void try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) +static inline void shrink_kill(struct dentry *victim) { - struct dentry *parent; + do { + rcu_read_unlock(); + victim = __dentry_kill(victim); + rcu_read_lock(); + } while (victim && lock_for_kill(victim)); + rcu_read_unlock(); + if (victim) + spin_unlock(&victim->d_lock); +} - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return; - if (parent == dentry) - return; +void shrink_dentry_list(struct list_head *list) +{ + while (!list_empty(list)) { + struct dentry *dentry; - /* Prune ancestors. */ - dentry = parent; - while (dentry) { + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { - dentry->d_count--; + rcu_read_lock(); + if (!lock_for_kill(dentry)) { + bool can_free; + rcu_read_unlock(); + d_shrink_del(dentry); + can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); - return; + if (can_free) + dentry_free(dentry); + continue; } - dentry = dentry_kill(dentry, 1); + d_shrink_del(dentry); + shrink_kill(dentry); } } +EXPORT_SYMBOL(shrink_dentry_list); -static void shrink_dentry_list(struct list_head *list) +static enum lru_status dentry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, void *arg) { - struct dentry *dentry; + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ - spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } - /* - * We found an inuse dentry which was not removed from - * the LRU because of laziness during lookup. Do not free - * it - just keep it off the LRU list. - */ - if (dentry->d_count) { - dentry_lru_del(dentry); - spin_unlock(&dentry->d_lock); - continue; - } + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; - rcu_read_unlock(); + /* + * Referenced dentries are still in use. If they have active + * counts, just remove them from the LRU. Otherwise give them + * another pass through the LRU. + */ + if (dentry->d_lockref.count) { + d_lru_isolate(lru, dentry); + spin_unlock(&dentry->d_lock); + return LRU_REMOVED; + } - try_prune_one_dentry(dentry); + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + spin_unlock(&dentry->d_lock); - rcu_read_lock(); + /* + * The list move itself will be made by the common LRU code. At + * this point, we've dropped the dentry->d_lock but keep the + * lru lock. This is safe to do, since every list movement is + * protected by the lru lock even if both locks are held. + * + * This is guaranteed by the fact that all LRU management + * functions are intermediated by the LRU API calls like + * list_lru_add_obj and list_lru_del_obj. List movement in this file + * only ever occur through this functions or through callbacks + * like this one, that are called from the LRU API. + * + * The only exceptions to this are functions like + * shrink_dentry_list, and code that first checks for the + * DCACHE_SHRINK_LIST flag. Those are guaranteed to be + * operating only with stack provided lists after they are + * properly isolated from the main list. It is thus, always a + * local access. + */ + return LRU_ROTATE; } - rcu_read_unlock(); + + d_lru_shrink_move(lru, dentry, freeable); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @count: number of entries to try to free + * @sc: shrink control, passed to list_lru_shrink_walk() * - * Attempt to shrink the superblock dcache LRU by @count entries. This is - * done when we need more memory an called from the superblock shrinker + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This + * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ -void prune_dcache_sb(struct super_block *sb, int count) +long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { - struct dentry *dentry; - LIST_HEAD(referenced); - LIST_HEAD(tmp); - -relock: - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - dentry = list_entry(sb->s_dentry_lru.prev, - struct dentry, d_lru); - BUG_ON(dentry->d_sb != sb); - - if (!spin_trylock(&dentry->d_lock)) { - spin_unlock(&dcache_lru_lock); - cpu_relax(); - goto relock; - } + LIST_HEAD(dispose); + long freed; - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - } else { - list_move_tail(&dentry->d_lru, &tmp); - dentry->d_flags |= DCACHE_SHRINK_LIST; - spin_unlock(&dentry->d_lock); - if (!--count) - break; - } - cond_resched_lock(&dcache_lru_lock); - } - if (!list_empty(&referenced)) - list_splice(&referenced, &sb->s_dentry_lru); - spin_unlock(&dcache_lru_lock); + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, + dentry_lru_isolate, &dispose); + shrink_dentry_list(&dispose); + return freed; +} + +static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, + struct list_lru_one *lru, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + d_lru_shrink_move(lru, dentry, freeable); + spin_unlock(&dentry->d_lock); - shrink_dentry_list(&tmp); + return LRU_REMOVED; } + /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -868,202 +1277,89 @@ relock: */ void shrink_dcache_sb(struct super_block *sb) { - LIST_HEAD(tmp); + do { + LIST_HEAD(dispose); - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - list_splice_init(&sb->s_dentry_lru, &tmp); - spin_unlock(&dcache_lru_lock); - shrink_dentry_list(&tmp); - spin_lock(&dcache_lru_lock); - } - spin_unlock(&dcache_lru_lock); + list_lru_walk(&sb->s_dentry_lru, + dentry_lru_isolate_shrink, &dispose, 1024); + shrink_dentry_list(&dispose); + } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); -/* - * destroy a single subtree of dentries for unmount - * - see the comments on shrink_dcache_for_umount() for a description of the - * locking - */ -static void shrink_dcache_for_umount_subtree(struct dentry *dentry) -{ - struct dentry *parent; - - BUG_ON(!IS_ROOT(dentry)); - - for (;;) { - /* descend to the first leaf in the current subtree */ - while (!list_empty(&dentry->d_subdirs)) - dentry = list_entry(dentry->d_subdirs.next, - struct dentry, d_u.d_child); - - /* consume the dentries from this leaf up through its parents - * until we find one with children or run out altogether */ - do { - struct inode *inode; - - /* - * inform the fs that this dentry is about to be - * unhashed and destroyed. - */ - if (dentry->d_flags & DCACHE_OP_PRUNE) - dentry->d_op->d_prune(dentry); - - dentry_lru_del(dentry); - __d_shrink(dentry); - - if (dentry->d_count != 0) { - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", - dentry, - dentry->d_inode ? - dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, - dentry->d_count, - dentry->d_sb->s_type->name, - dentry->d_sb->s_id); - BUG(); - } - - if (IS_ROOT(dentry)) { - parent = NULL; - list_del(&dentry->d_u.d_child); - } else { - parent = dentry->d_parent; - parent->d_count--; - list_del(&dentry->d_u.d_child); - } - - inode = dentry->d_inode; - if (inode) { - dentry->d_inode = NULL; - hlist_del_init(&dentry->d_alias); - if (dentry->d_op && dentry->d_op->d_iput) - dentry->d_op->d_iput(dentry, inode); - else - iput(inode); - } - - d_free(dentry); - - /* finished when we fall off the top of the tree, - * otherwise we ascend to the parent and move to the - * next sibling if there is one */ - if (!parent) - return; - dentry = parent; - } while (list_empty(&dentry->d_subdirs)); - - dentry = list_entry(dentry->d_subdirs.next, - struct dentry, d_u.d_child); - } -} - -/* - * destroy the dentries attached to a superblock on unmounting - * - we don't need to use dentry->d_lock because: - * - the superblock is detached from all mountings and open files, so the - * dentry trees will not be rearranged by the VFS - * - s_umount is write-locked, so the memory pressure shrinker will ignore - * any dentries belonging to this superblock that it comes across - * - the filesystem itself is no longer permitted to rearrange the dentries - * in this superblock - */ -void shrink_dcache_for_umount(struct super_block *sb) -{ - struct dentry *dentry; - - if (down_read_trylock(&sb->s_umount)) - BUG(); - - dentry = sb->s_root; - sb->s_root = NULL; - dentry->d_count--; - shrink_dcache_for_umount_subtree(dentry); - - while (!hlist_bl_empty(&sb->s_anon)) { - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - shrink_dcache_for_umount_subtree(dentry); - } -} - -/* - * This tries to ascend one level of parenthood, but - * we can race with renaming, so we need to re-check - * the parenthood after dropping the lock and check - * that the sequence number still matches. - */ -static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) -{ - struct dentry *new = old->d_parent; - - rcu_read_lock(); - spin_unlock(&old->d_lock); - spin_lock(&new->d_lock); - - /* - * might go back up the wrong parent if we have had a rename - * or deletion - */ - if (new != old->d_parent || - (old->d_flags & DCACHE_DENTRY_KILLED) || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&new->d_lock); - new = NULL; - } - rcu_read_unlock(); - return new; -} - +/** + * enum d_walk_ret - action to talke during tree walk + * @D_WALK_CONTINUE: contrinue walk + * @D_WALK_QUIT: quit walk + * @D_WALK_NORETRY: quit when retry is needed + * @D_WALK_SKIP: skip this dentry and its children + */ +enum d_walk_ret { + D_WALK_CONTINUE, + D_WALK_QUIT, + D_WALK_NORETRY, + D_WALK_SKIP, +}; -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ - /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * d_walk - walk the dentry tree + * @parent: start of walk + * @data: data passed to @enter() and @finish() + * @enter: callback when first entering the dentry * - * Return true if the parent or its subdirectories contain - * a mount point + * The @enter() callbacks are called with d_lock held. */ -int have_submounts(struct dentry *parent) +static void d_walk(struct dentry *parent, void *data, + enum d_walk_ret (*enter)(void *, struct dentry *)) { - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int locked = 0; + struct dentry *this_parent, *dentry; + unsigned seq = 0; + enum d_walk_ret ret; + bool retry = true; - seq = read_seqbegin(&rename_lock); again: + read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; - - if (d_mountpoint(parent)) - goto positive; spin_lock(&this_parent->d_lock); + + ret = enter(data, this_parent); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: + case D_WALK_SKIP: + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + } repeat: - next = this_parent->d_subdirs.next; + dentry = d_first_child(this_parent); resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; + hlist_for_each_entry_from(dentry, d_sib) { + if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) + continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* Have we found a mount point ? */ - if (d_mountpoint(dentry)) { + + ret = enter(data, dentry); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: spin_unlock(&dentry->d_lock); - spin_unlock(&this_parent->d_lock); - goto positive; + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + case D_WALK_SKIP: + spin_unlock(&dentry->d_lock); + continue; } - if (!list_empty(&dentry->d_subdirs)) { + + if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; @@ -1073,41 +1369,126 @@ resume: /* * All done at this level ... ascend and resume the search. */ + rcu_read_lock(); +ascend: if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) + dentry = this_parent; + this_parent = dentry->d_parent; + + spin_unlock(&dentry->d_lock); + spin_lock(&this_parent->d_lock); + + /* might go back up the wrong parent if we have had a rename. */ + if (need_seqretry(&rename_lock, seq)) goto rename_retry; - next = child->d_u.d_child.next; - goto resume; + /* go into the first sibling still alive */ + hlist_for_each_entry_continue(dentry, d_sib) { + if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { + rcu_read_unlock(); + goto resume; + } + } + goto ascend; } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return 0; /* No mount points found in tree */ -positive: - if (!locked && read_seqretry(&rename_lock, seq)) + if (need_seqretry(&rename_lock, seq)) goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return 1; + rcu_read_unlock(); + +out_unlock: + spin_unlock(&this_parent->d_lock); + done_seqretry(&rename_lock, seq); + return; rename_retry: - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + BUG_ON(seq & 1); + if (!retry) + return; + seq = 1; goto again; } -EXPORT_SYMBOL(have_submounts); + +struct check_mount { + struct vfsmount *mnt; + unsigned int mounted; +}; + +/* locks: mount_locked_reader && dentry->d_lock */ +static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) +{ + struct check_mount *info = data; + struct path path = { .mnt = info->mnt, .dentry = dentry }; + + if (likely(!d_mountpoint(dentry))) + return D_WALK_CONTINUE; + if (__path_is_mountpoint(&path)) { + info->mounted = 1; + return D_WALK_QUIT; + } + return D_WALK_CONTINUE; +} + +/** + * path_has_submounts - check for mounts over a dentry in the + * current namespace. + * @parent: path to check. + * + * Return true if the parent or its subdirectories contain + * a mount point in the current namespace. + */ +int path_has_submounts(const struct path *parent) +{ + struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; + + guard(mount_locked_reader)(); + d_walk(parent->dentry, &data, path_check_mount); + + return data.mounted; +} +EXPORT_SYMBOL(path_has_submounts); + +/* + * Called by mount code to set a mountpoint and check if the mountpoint is + * reachable (e.g. NFS can unhash a directory dentry and then the complete + * subtree can become unreachable). + * + * Only one of d_invalidate() and d_set_mounted() must succeed. For + * this reason take rename_lock and d_lock on dentry and ancestors. + */ +int d_set_mounted(struct dentry *dentry) +{ + struct dentry *p; + int ret = -ENOENT; + read_seqlock_excl(&rename_lock); + for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { + /* Need exclusion wrt. d_invalidate() */ + spin_lock(&p->d_lock); + if (unlikely(d_unhashed(p))) { + spin_unlock(&p->d_lock); + goto out; + } + spin_unlock(&p->d_lock); + } + spin_lock(&dentry->d_lock); + if (!d_unlinked(dentry)) { + ret = -EBUSY; + if (!d_mountpoint(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } + } + spin_unlock(&dentry->d_lock); +out: + read_sequnlock_excl(&rename_lock); + return ret; +} /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level - * whenever the d_subdirs list is non-empty and continue + * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, @@ -1117,113 +1498,219 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry *parent, struct list_head *dispose) + +struct select_data { + struct dentry *start; + union { + long found; + struct dentry *victim; + }; + struct list_head dispose; +}; + +static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int found = 0; - int locked = 0; + struct select_data *data = _data; + enum d_walk_ret ret = D_WALK_CONTINUE; - seq = read_seqbegin(&rename_lock); -again: - this_parent = parent; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; + if (data->start == dentry) + goto out; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + data->found++; + } else if (!dentry->d_lockref.count) { + to_shrink_list(dentry, &data->dispose); + data->found++; + } else if (dentry->d_lockref.count < 0) { + data->found++; + } + /* + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. + */ + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; +out: + return ret; +} - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_move_list(dentry, dispose); - dentry->d_flags |= DCACHE_SHRINK_LIST; - found++; - } - /* - * We can return to the caller if we have found some (this - * ensures forward progress). We'll be coming back to find - * the rest. - */ - if (found && need_resched()) { - spin_unlock(&dentry->d_lock); - goto out; - } +static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry) +{ + if (dentry->d_flags & DCACHE_PERSISTENT) { + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + } + return select_collect(_data, dentry); +} - /* - * Descend a level if the d_subdirs list is non-empty. - */ - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } +static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + enum d_walk_ret ret = D_WALK_CONTINUE; - spin_unlock(&dentry->d_lock); + if (data->start == dentry) + goto out; + + if (!dentry->d_lockref.count) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + rcu_read_lock(); + data->victim = dentry; + return D_WALK_QUIT; + } + to_shrink_list(dentry, &data->dispose); } /* - * All done at this level ... ascend and resume the search. + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. */ - if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; - } + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return found; - -rename_retry: - if (found) - return found; - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; + return ret; } /** - * shrink_dcache_parent - prune dcache + * shrink_dcache_tree - prune dcache * @parent: parent of entries to prune + * @for_umount: true if we want to unpin the persistent ones * * Prune the dcache to remove unused children of the parent dentry. */ -void shrink_dcache_parent(struct dentry * parent) +static void shrink_dcache_tree(struct dentry *parent, bool for_umount) { - LIST_HEAD(dispose); - int found; + for (;;) { + struct select_data data = {.start = parent}; + + INIT_LIST_HEAD(&data.dispose); + d_walk(parent, &data, + for_umount ? select_collect_umount : select_collect); + + if (!list_empty(&data.dispose)) { + shrink_dentry_list(&data.dispose); + continue; + } - while ((found = select_parent(parent, &dispose)) != 0) { - shrink_dentry_list(&dispose); cond_resched(); + if (!data.found) + break; + data.victim = NULL; + d_walk(parent, &data, select_collect2); + if (data.victim) { + spin_lock(&data.victim->d_lock); + if (!lock_for_kill(data.victim)) { + spin_unlock(&data.victim->d_lock); + rcu_read_unlock(); + } else { + shrink_kill(data.victim); + } + } + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); } } + +void shrink_dcache_parent(struct dentry *parent) +{ + shrink_dcache_tree(parent, false); +} EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) +{ + /* it has busy descendents; complain about those instead */ + if (!hlist_empty(&dentry->d_children)) + return D_WALK_CONTINUE; + + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", + dentry, + dentry->d_inode ? + dentry->d_inode->i_ino : 0UL, + dentry, + dentry->d_lockref.count, + dentry->d_sb->s_type->name, + dentry->d_sb->s_id); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_tree(dentry, true); + d_walk(dentry, dentry, umount_check); + d_drop(dentry); + dput(dentry); +} + +/* + * destroy the dentries attached to a superblock on unmounting + */ +void shrink_dcache_for_umount(struct super_block *sb) +{ + struct dentry *dentry; + + rwsem_assert_held_write(&sb->s_umount); + + dentry = sb->s_root; + sb->s_root = NULL; + do_one_tree(dentry); + + while (!hlist_bl_empty(&sb->s_roots)) { + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); + do_one_tree(dentry); + } +} + +static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) +{ + struct dentry **victim = _data; + if (d_mountpoint(dentry)) { + *victim = dget_dlock(dentry); + return D_WALK_QUIT; + } + return D_WALK_CONTINUE; +} + +/** + * d_invalidate - detach submounts, prune dcache, and drop + * @dentry: dentry to invalidate (aka detach, prune and drop) + */ +void d_invalidate(struct dentry *dentry) +{ + bool had_submounts = false; + spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + return; + } + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + + /* Negative dentries can be dropped without further checks */ + if (!dentry->d_inode) + return; + + shrink_dcache_parent(dentry); + for (;;) { + struct dentry *victim = NULL; + d_walk(dentry, &victim, find_submount); + if (!victim) { + if (had_submounts) + shrink_dcache_parent(dentry); + return; + } + had_submounts = true; + detach_mounts(victim); + dput(victim); + } +} +EXPORT_SYMBOL(d_invalidate); + /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to @@ -1234,12 +1721,14 @@ EXPORT_SYMBOL(shrink_dcache_parent); * copied and the copy passed in may be reused after this call. */ -struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) +static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; + int err; - dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, + GFP_KERNEL); if (!dentry) return NULL; @@ -1249,41 +1738,57 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ - dentry->d_iname[DNAME_INLINE_LEN-1] = 0; - if (name->len > DNAME_INLINE_LEN-1) { - dname = kmalloc(name->len + 1, GFP_KERNEL); - if (!dname) { + dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; + if (unlikely(!name)) { + name = &slash_name; + dname = dentry->d_shortname.string; + } else if (name->len > DNAME_INLINE_LEN-1) { + size_t size = offsetof(struct external_name, name[1]); + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT | + __GFP_RECLAIMABLE); + if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } + atomic_set(&p->count, 1); + dname = p->name; } else { - dname = dentry->d_iname; + dname = dentry->d_shortname.string; } - dentry->d_name.len = name->len; - dentry->d_name.hash = name->hash; + dentry->__d_name.len = name->len; + dentry->__d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ - smp_wmb(); - dentry->d_name.name = dname; + smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */ - dentry->d_count = 1; dentry->d_flags = 0; - spin_lock_init(&dentry->d_lock); - seqcount_init(&dentry->d_seq); + lockref_init(&dentry->d_lockref); + seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; - dentry->d_op = NULL; + dentry->d_op = sb->__s_d_op; + dentry->d_flags = sb->s_d_flags; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); - INIT_HLIST_NODE(&dentry->d_alias); - INIT_LIST_HEAD(&dentry->d_u.d_child); - d_set_d_op(dentry, dentry->d_sb->s_d_op); + INIT_HLIST_HEAD(&dentry->d_children); + INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_HLIST_NODE(&dentry->d_sib); + + if (dentry->d_op && dentry->d_op->d_init) { + err = dentry->d_op->d_init(dentry); + if (err) { + if (dname_external(dentry)) + kfree(external_name(dentry)); + kmem_cache_free(dentry_cache, dentry); + return NULL; + } + } this_cpu_inc(nr_dentry); @@ -1304,80 +1809,172 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; - spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ - __dget_dlock(parent); - dentry->d_parent = parent; - list_add(&dentry->d_u.d_child, &parent->d_subdirs); + dentry->d_parent = dget_dlock(parent); + hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); +struct dentry *d_alloc_anon(struct super_block *sb) +{ + return __d_alloc(sb, NULL); +} +EXPORT_SYMBOL(d_alloc_anon); + +struct dentry *d_alloc_cursor(struct dentry * parent) +{ + struct dentry *dentry = d_alloc_anon(parent->d_sb); + if (dentry) { + dentry->d_flags |= DCACHE_DENTRY_CURSOR; + dentry->d_parent = dget(parent); + } + return dentry; +} + +/** + * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) + * @sb: the superblock + * @name: qstr of the name + * + * For a filesystem that just pins its dentries in memory and never + * performs lookups at all, return an unhashed IS_ROOT dentry. + * This is used for pipes, sockets et.al. - the stuff that should + * never be anyone's children or parents. Unlike all other + * dentries, these will not have RCU delay between dropping the + * last reference and freeing them. + * + * The only user is alloc_file_pseudo() and that's what should + * be considered a public interface. Don't use directly. + */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { + static const struct dentry_operations anon_ops = { + .d_dname = simple_dname + }; struct dentry *dentry = __d_alloc(sb, name); - if (dentry) - dentry->d_flags |= DCACHE_DISCONNECTED; + if (likely(dentry)) { + dentry->d_flags |= DCACHE_NORCU; + /* d_op_flags(&anon_ops) is 0 */ + if (!dentry->d_op) + dentry->d_op = &anon_ops; + } return dentry; } -EXPORT_SYMBOL(d_alloc_pseudo); struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; - q.len = strlen(name); - q.hash = full_name_hash(q.name, q.len); + q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); -void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +#define DCACHE_OP_FLAGS \ + (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | \ + DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_PRUNE | \ + DCACHE_OP_REAL) + +static unsigned int d_op_flags(const struct dentry_operations *op) { + unsigned int flags = 0; + if (op) { + if (op->d_hash) + flags |= DCACHE_OP_HASH; + if (op->d_compare) + flags |= DCACHE_OP_COMPARE; + if (op->d_revalidate) + flags |= DCACHE_OP_REVALIDATE; + if (op->d_weak_revalidate) + flags |= DCACHE_OP_WEAK_REVALIDATE; + if (op->d_delete) + flags |= DCACHE_OP_DELETE; + if (op->d_prune) + flags |= DCACHE_OP_PRUNE; + if (op->d_real) + flags |= DCACHE_OP_REAL; + } + return flags; +} + +static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +{ + unsigned int flags = d_op_flags(op); WARN_ON_ONCE(dentry->d_op); - WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | - DCACHE_OP_COMPARE | - DCACHE_OP_REVALIDATE | - DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_DELETE )); + WARN_ON_ONCE(dentry->d_flags & DCACHE_OP_FLAGS); dentry->d_op = op; - if (!op) - return; - if (op->d_hash) - dentry->d_flags |= DCACHE_OP_HASH; - if (op->d_compare) - dentry->d_flags |= DCACHE_OP_COMPARE; - if (op->d_revalidate) - dentry->d_flags |= DCACHE_OP_REVALIDATE; - if (op->d_weak_revalidate) - dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; - if (op->d_delete) - dentry->d_flags |= DCACHE_OP_DELETE; - if (op->d_prune) - dentry->d_flags |= DCACHE_OP_PRUNE; - -} -EXPORT_SYMBOL(d_set_d_op); + if (flags) + dentry->d_flags |= flags; +} -static void __d_instantiate(struct dentry *dentry, struct inode *inode) +void set_default_d_op(struct super_block *s, const struct dentry_operations *ops) { - spin_lock(&dentry->d_lock); - if (inode) { - if (unlikely(IS_AUTOMOUNT(inode))) - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - hlist_add_head(&dentry->d_alias, &inode->i_dentry); + unsigned int flags = d_op_flags(ops); + s->__s_d_op = ops; + s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags; +} +EXPORT_SYMBOL(set_default_d_op); + +static unsigned d_flags_for_inode(struct inode *inode) +{ + unsigned add_flags = DCACHE_REGULAR_TYPE; + + if (!inode) + return DCACHE_MISS_TYPE; + + if (S_ISDIR(inode->i_mode)) { + add_flags = DCACHE_DIRECTORY_TYPE; + if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { + if (unlikely(!inode->i_op->lookup)) + add_flags = DCACHE_AUTODIR_TYPE; + else + inode->i_opflags |= IOP_LOOKUP; + } + goto type_determined; } - dentry->d_inode = inode; - dentry_rcuwalk_barrier(dentry); - spin_unlock(&dentry->d_lock); - fsnotify_d_instantiate(dentry, inode); + + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (unlikely(inode->i_op->get_link)) { + add_flags = DCACHE_SYMLINK_TYPE; + goto type_determined; + } + inode->i_opflags |= IOP_NOFOLLOW; + } + + if (unlikely(!S_ISREG(inode->i_mode))) + add_flags = DCACHE_SPECIAL_TYPE; + +type_determined: + if (unlikely(IS_AUTOMOUNT(inode))) + add_flags |= DCACHE_NEED_AUTOMOUNT; + return add_flags; +} + +static void __d_instantiate(struct dentry *dentry, struct inode *inode) +{ + unsigned add_flags = d_flags_for_inode(inode); + WARN_ON(d_in_lookup(dentry)); + + /* + * The negative counter only tracks dentries on the LRU. Don't dec if + * d_lru is on another list. + */ + if ((dentry->d_flags & + (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) + this_cpu_dec(nr_dentry_negative); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); + __d_set_inode_and_type(dentry, inode, add_flags); + raw_write_seqcount_end(&dentry->d_seq); + fsnotify_update_flags(dentry); } /** @@ -1397,99 +1994,47 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { - BUG_ON(!hlist_unhashed(&entry->d_alias)); - if (inode) + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + if (inode) { + security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); - __d_instantiate(entry, inode); - if (inode) + spin_lock(&entry->d_lock); + __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); spin_unlock(&inode->i_lock); - security_d_instantiate(entry, inode); + } } EXPORT_SYMBOL(d_instantiate); -/** - * d_instantiate_unique - instantiate a non-aliased dentry - * @entry: dentry to instantiate - * @inode: inode to attach to this dentry - * - * Fill in inode information in the entry. On success, it returns NULL. - * If an unhashed alias of "entry" already exists, then we return the - * aliased dentry instead and drop one reference to inode. - * - * Note that in order to avoid conflicts with rename() etc, the caller - * had better be holding the parent directory semaphore. - * - * This also assumes that the inode count has been incremented - * (or otherwise set) by the caller to indicate that it is now - * in use by the dcache. +/* + * This should be equivalent to d_instantiate() + unlock_new_inode(), + * with lockdep-related part of unlock_new_inode() done before + * anything else. Use that instead of open-coding d_instantiate()/ + * unlock_new_inode() combinations. */ -static struct dentry *__d_instantiate_unique(struct dentry *entry, - struct inode *inode) +void d_instantiate_new(struct dentry *entry, struct inode *inode) { - struct dentry *alias; - int len = entry->d_name.len; - const char *name = entry->d_name.name; - unsigned int hash = entry->d_name.hash; - - if (!inode) { - __d_instantiate(entry, NULL); - return NULL; - } - - hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. - */ - if (alias->d_name.hash != hash) - continue; - if (alias->d_parent != entry->d_parent) - continue; - if (alias->d_name.len != len) - continue; - if (dentry_cmp(alias, name, len)) - continue; - __dget(alias); - return alias; - } - + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!inode); + lockdep_annotate_inode_mutex_key(inode); + security_d_instantiate(entry, inode); + spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); - return NULL; -} - -struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) -{ - struct dentry *result; - - BUG_ON(!hlist_unhashed(&entry->d_alias)); - - if (inode) - spin_lock(&inode->i_lock); - result = __d_instantiate_unique(entry, inode); - if (inode) - spin_unlock(&inode->i_lock); - - if (!result) { - security_d_instantiate(entry, inode); - return NULL; - } - - BUG_ON(!d_unhashed(result)); - iput(inode); - return result; + spin_unlock(&entry->d_lock); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); + inode_wake_up_bit(inode, __I_NEW); + spin_unlock(&inode->i_lock); } - -EXPORT_SYMBOL(d_instantiate_unique); +EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { - static const struct qstr name = QSTR_INIT("/", 1); - - res = __d_alloc(root_inode->i_sb, &name); + res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else @@ -1499,37 +2044,61 @@ struct dentry *d_make_root(struct inode *root_inode) } EXPORT_SYMBOL(d_make_root); -static struct dentry * __d_find_any_alias(struct inode *inode) +static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { - struct dentry *alias; + struct super_block *sb; + struct dentry *new, *res; - if (hlist_empty(&inode->i_dentry)) - return NULL; - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); - __dget(alias); - return alias; -} + if (!inode) + return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); -/** - * d_find_any_alias - find any alias for a given inode - * @inode: inode to find an alias for - * - * If any aliases exist for the given inode, take and return a - * reference for one of them. If no aliases exist, return %NULL. - */ -struct dentry *d_find_any_alias(struct inode *inode) -{ - struct dentry *de; + sb = inode->i_sb; + + res = d_find_any_alias(inode); /* existing alias? */ + if (res) + goto out; + + new = d_alloc_anon(sb); + if (!new) { + res = ERR_PTR(-ENOMEM); + goto out; + } + security_d_instantiate(new, inode); spin_lock(&inode->i_lock); - de = __d_find_any_alias(inode); - spin_unlock(&inode->i_lock); - return de; + res = __d_find_any_alias(inode); /* recheck under lock */ + if (likely(!res)) { /* still no alias, attach a disconnected dentry */ + unsigned add_flags = d_flags_for_inode(inode); + + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; + + spin_lock(&new->d_lock); + __d_set_inode_and_type(new, inode, add_flags); + hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); + if (!disconnected) { + hlist_bl_lock(&sb->s_roots); + hlist_bl_add_head(&new->d_hash, &sb->s_roots); + hlist_bl_unlock(&sb->s_roots); + } + spin_unlock(&new->d_lock); + spin_unlock(&inode->i_lock); + inode = NULL; /* consumed by new->d_inode */ + res = new; + } else { + spin_unlock(&inode->i_lock); + dput(new); + } + + out: + iput(inode); + return res; } -EXPORT_SYMBOL(d_find_any_alias); /** - * d_obtain_alias - find or allocate a dentry for a given inode + * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or @@ -1543,124 +2112,48 @@ EXPORT_SYMBOL(d_find_any_alias); * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may - * be passed in and will be the error will be propagate to the return value, + * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { - static const struct qstr anonstring = QSTR_INIT("/", 1); - struct dentry *tmp; - struct dentry *res; - - if (!inode) - return ERR_PTR(-ESTALE); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - res = d_find_any_alias(inode); - if (res) - goto out_iput; - - tmp = __d_alloc(inode->i_sb, &anonstring); - if (!tmp) { - res = ERR_PTR(-ENOMEM); - goto out_iput; - } - - spin_lock(&inode->i_lock); - res = __d_find_any_alias(inode); - if (res) { - spin_unlock(&inode->i_lock); - dput(tmp); - goto out_iput; - } - - /* attach a disconnected dentry */ - spin_lock(&tmp->d_lock); - tmp->d_inode = inode; - tmp->d_flags |= DCACHE_DISCONNECTED; - hlist_add_head(&tmp->d_alias, &inode->i_dentry); - hlist_bl_lock(&tmp->d_sb->s_anon); - hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); - hlist_bl_unlock(&tmp->d_sb->s_anon); - spin_unlock(&tmp->d_lock); - spin_unlock(&inode->i_lock); - security_d_instantiate(tmp, inode); - - return tmp; - - out_iput: - if (res && !IS_ERR(res)) - security_d_instantiate(res, inode); - iput(inode); - return res; + return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** - * d_splice_alias - splice a disconnected dentry into the tree if one exists - * @inode: the inode which may have a disconnected dentry - * @dentry: a negative dentry which we want to point to the inode. - * - * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and - * DCACHE_DISCONNECTED), then d_move that in place of the given dentry - * and return it, else simply d_add the inode to the dentry and return NULL. + * d_obtain_root - find or allocate a dentry for a given inode + * @inode: inode to allocate the dentry for * - * This is needed in the lookup routine of any filesystem that is exportable - * (via knfsd) so that we can build dcache paths to directories effectively. + * Obtain an IS_ROOT dentry for the root of a filesystem. * - * If a dentry was found and moved, then it is returned. Otherwise NULL - * is returned. This matches the expected return value of ->lookup. + * We must ensure that directory inodes only ever have one dentry. If a + * dentry is found, that is returned instead of allocating a new one. * - * Cluster filesystems may call this function with a negative, hashed dentry. - * In that case, we know that the inode will be a regular file, and also this - * will only occur during atomic_open. So we need to check for the dentry - * being already hashed only in the final case. + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is + * released. A %NULL or IS_ERR inode may be passed in and will be the + * error will be propagate to the return value, with a %NULL @inode + * replaced by ERR_PTR(-ESTALE). */ -struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +struct dentry *d_obtain_root(struct inode *inode) { - struct dentry *new = NULL; - - if (IS_ERR(inode)) - return ERR_CAST(inode); - - if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - new = __d_find_alias(inode, 1); - if (new) { - BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); - spin_unlock(&inode->i_lock); - security_d_instantiate(new, inode); - d_move(new, dentry); - iput(inode); - } else { - /* already taking inode->i_lock, so d_add() by hand */ - __d_instantiate(dentry, inode); - spin_unlock(&inode->i_lock); - security_d_instantiate(dentry, inode); - d_rehash(dentry); - } - } else { - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) - d_rehash(dentry); - } - return new; + return __d_obtain_alias(inode, false); } -EXPORT_SYMBOL(d_splice_alias); +EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name - * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func + * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * - * For a case-insensitive lookup match and if the the case-exact dentry - * already exists in in the dcache, use it and return it. + * For a case-insensitive lookup match and if the case-exact dentry + * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. @@ -1668,100 +2161,103 @@ EXPORT_SYMBOL(d_splice_alias); struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { - struct dentry *found; - struct dentry *new; + struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); - if (unlikely(IS_ERR(found))) - goto err_out; - if (!found) { - new = d_alloc(dentry->d_parent, name); - if (!new) { - found = ERR_PTR(-ENOMEM); - goto err_out; - } - - found = d_splice_alias(inode, new); - if (found) { - dput(new); - return found; - } - return new; - } - - /* - * If a matching dentry exists, and it's not negative use it. - * - * Decrement the reference count to balance the iget() done - * earlier on. - */ - if (found->d_inode) { - if (unlikely(found->d_inode != inode)) { - /* This can't happen because bad inodes are unhashed. */ - BUG_ON(!is_bad_inode(inode)); - BUG_ON(!is_bad_inode(found->d_inode)); - } + if (found) { iput(inode); return found; } - - /* - * Negative dentry: instantiate it unless the inode is a directory and - * already has a dentry. - */ - new = d_splice_alias(inode, found); - if (new) { + if (d_in_lookup(dentry)) { + found = d_alloc_parallel(dentry->d_parent, name, + dentry->d_wait); + if (IS_ERR(found) || !d_in_lookup(found)) { + iput(inode); + return found; + } + } else { + found = d_alloc(dentry->d_parent, name); + if (!found) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + } + res = d_splice_alias(inode, found); + if (res) { + d_lookup_done(found); dput(found); - found = new; + return res; } return found; - -err_out: - iput(inode); - return found; } EXPORT_SYMBOL(d_add_ci); -/* - * Do the slow-case of the dentry name compare. - * - * Unlike the dentry_cmp() function, we need to atomically - * load the name and length information, so that the - * filesystem can rely on them, and can use the 'name' and - * 'len' information without worrying about walking off the - * end of memory etc. - * - * Thus the read_seqcount_retry() and the "duplicate" info - * in arguments (the low-level filesystem should not look - * at the dentry inode or name contents directly, since - * rename can change them while we're in RCU mode). +/** + * d_same_name - compare dentry name with case-exact name + * @dentry: the negative dentry that was passed to the parent's lookup func + * @parent: parent dentry + * @name: the case-exact name to be associated with the returned dentry + * + * Return: true if names are same, or false */ -enum slow_d_compare { - D_COMP_OK, - D_COMP_NOMATCH, - D_COMP_SEQRETRY, -}; +bool d_same_name(const struct dentry *dentry, const struct dentry *parent, + const struct qstr *name) +{ + if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { + if (dentry->d_name.len != name->len) + return false; + return dentry_cmp(dentry, name->name, name->len) == 0; + } + return parent->d_op->d_compare(dentry, + dentry->d_name.len, dentry->d_name.name, + name) == 0; +} +EXPORT_SYMBOL_GPL(d_same_name); -static noinline enum slow_d_compare slow_dentry_cmp( - const struct dentry *parent, - struct dentry *dentry, - unsigned int seq, - const struct qstr *name) +/* + * This is __d_lookup_rcu() when the parent dentry has + * DCACHE_OP_COMPARE, which makes things much nastier. + */ +static noinline struct dentry *__d_lookup_rcu_op_compare( + const struct dentry *parent, + const struct qstr *name, + unsigned *seqp) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; + u64 hashlen = name->hash_len; + struct hlist_bl_head *b = d_hash(hashlen); + struct hlist_bl_node *node; + struct dentry *dentry; - if (read_seqcount_retry(&dentry->d_seq, seq)) { - cpu_relax(); - return D_COMP_SEQRETRY; + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + int tlen; + const char *tname; + unsigned seq; + +seqretry: + seq = raw_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + if (dentry->d_name.hash != hashlen_hash(hashlen)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + /* we want a consistent (name,len) pair */ + if (read_seqcount_retry(&dentry->d_seq, seq)) { + cpu_relax(); + goto seqretry; + } + if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) + continue; + *seqp = seq; + return dentry; } - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - return D_COMP_NOMATCH; - return D_COMP_OK; + return NULL; } /** @@ -1782,9 +2278,6 @@ static noinline enum slow_d_compare slow_dentry_cmp( * without taking d_lock and checking d_seq sequence count against @seq * returned here. * - * A refcount may be taken on the found dentry with the __d_rcu_to_refcount - * function. - * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks @@ -1799,7 +2292,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, { u64 hashlen = name->hash_len; const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen)); + struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; @@ -1810,6 +2303,9 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, * Keep the two functions in sync. */ + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) + return __d_lookup_rcu_op_compare(parent, name, seqp); + /* * The hash list is protected using RCU. * @@ -1826,7 +2322,6 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; -seqretry: /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. @@ -1840,32 +2335,30 @@ seqretry: * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. + * + * Note that raw_seqcount_begin still *does* smp_rmb(), so + * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; - if (d_unhashed(dentry)) - continue; - - if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { - if (dentry->d_name.hash != hashlen_hash(hashlen)) - continue; - *seqp = seq; - switch (slow_dentry_cmp(parent, dentry, seq, name)) { - case D_COMP_OK: - return dentry; - case D_COMP_NOMATCH: - continue; - default: - goto seqretry; - } - } - if (dentry->d_name.hash_len != hashlen) continue; + if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)) + continue; + /* + * Check for the dentry being unhashed. + * + * As tempting as it is, we *can't* skip it because of a race window + * between us finding the dentry before it gets unhashed and loading + * the sequence counter after unhashing is finished. + * + * We can at least predict on it. + */ + if (unlikely(d_unhashed(dentry))) + continue; *seqp = seq; - if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) - return dentry; + return dentry; } return NULL; } @@ -1886,10 +2379,10 @@ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) struct dentry *dentry; unsigned seq; - do { - seq = read_seqbegin(&rename_lock); - dentry = __d_lookup(parent, name); - if (dentry) + do { + seq = read_seqbegin(&rename_lock); + dentry = __d_lookup(parent, name); + if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; @@ -1913,10 +2406,8 @@ EXPORT_SYMBOL(d_lookup); */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { - unsigned int len = name->len; unsigned int hash = name->hash; - const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; @@ -1954,23 +2445,10 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) if (d_unhashed(dentry)) goto next; - /* - * It is safe to compare names since d_move() cannot - * change the qstr (protected by d_lock). - */ - if (parent->d_flags & DCACHE_OP_COMPARE) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - goto next; - } else { - if (dentry->d_name.len != len) - goto next; - if (dentry_cmp(dentry, str, len)) - goto next; - } + if (!d_same_name(dentry, parent, name)) + goto next; - dentry->d_count++; + dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -1996,7 +2474,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ - name->hash = full_name_hash(name->name, name->len); + name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) @@ -2004,38 +2482,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) } return d_lookup(dir, name); } -EXPORT_SYMBOL(d_hash_and_lookup); - -/** - * d_validate - verify dentry provided from insecure source (deprecated) - * @dentry: The dentry alleged to be valid child of @dparent - * @dparent: The parent dentry (known to be valid) - * - * An insecure source has sent us a dentry, here we verify it and dget() it. - * This is used by ncpfs in its readdir implementation. - * Zero is returned in the dentry is invalid. - * - * This function is slow for big directories, and deprecated, do not use it. - */ -int d_validate(struct dentry *dentry, struct dentry *dparent) -{ - struct dentry *child; - - spin_lock(&dparent->d_lock); - list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) { - if (dentry == child) { - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dparent->d_lock); - return 1; - } - } - spin_unlock(&dparent->d_lock); - - return 0; -} -EXPORT_SYMBOL(d_validate); /* * When a file is deleted, we have two options: @@ -2060,50 +2506,35 @@ EXPORT_SYMBOL(d_validate); void d_delete(struct dentry * dentry) { - struct inode *inode; - int isdir = 0; + struct inode *inode = dentry->d_inode; + + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); /* * Are we the only user? */ -again: - spin_lock(&dentry->d_lock); - inode = dentry->d_inode; - isdir = S_ISDIR(inode->i_mode); - if (dentry->d_count == 1) { - if (!spin_trylock(&inode->i_lock)) { - spin_unlock(&dentry->d_lock); - cpu_relax(); - goto again; - } + if (dentry->d_lockref.count == 1) { + if (dentry_negative_policy) + __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); - fsnotify_nameremove(dentry, isdir); - return; - } - - if (!d_unhashed(dentry)) + } else { __d_drop(dentry); - - spin_unlock(&dentry->d_lock); - - fsnotify_nameremove(dentry, isdir); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + } } EXPORT_SYMBOL(d_delete); -static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) +static void __d_rehash(struct dentry *entry) { - BUG_ON(!d_unhashed(entry)); + struct hlist_bl_head *b = d_hash(entry->d_name.hash); + hlist_bl_lock(b); - entry->d_flags |= DCACHE_RCUACCESS; hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } -static void _d_rehash(struct dentry * entry) -{ - __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); -} - /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash @@ -2114,188 +2545,410 @@ static void _d_rehash(struct dentry * entry) void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); - _d_rehash(entry); + __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); +static inline unsigned start_dir_add(struct inode *dir) +{ + preempt_disable_nested(); + for (;;) { + unsigned n = READ_ONCE(dir->i_dir_seq); + if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) + return n; + cpu_relax(); + } +} + +static inline void end_dir_add(struct inode *dir, unsigned int n, + wait_queue_head_t *d_wait) +{ + smp_store_release(&dir->i_dir_seq, n + 2); + preempt_enable_nested(); + if (wq_has_sleeper(d_wait)) + wake_up_all(d_wait); +} + +static void d_wait_lookup(struct dentry *dentry) +{ + if (d_in_lookup(dentry)) { + DECLARE_WAITQUEUE(wait, current); + add_wait_queue(dentry->d_wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&dentry->d_lock); + schedule(); + spin_lock(&dentry->d_lock); + } while (d_in_lookup(dentry)); + } +} + +struct dentry *d_alloc_parallel(struct dentry *parent, + const struct qstr *name, + wait_queue_head_t *wq) +{ + unsigned int hash = name->hash; + struct hlist_bl_head *b = in_lookup_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *new = __d_alloc(parent->d_sb, name); + struct dentry *dentry; + unsigned seq, r_seq, d_seq; + + if (unlikely(!new)) + return ERR_PTR(-ENOMEM); + + new->d_flags |= DCACHE_PAR_LOOKUP; + spin_lock(&parent->d_lock); + new->d_parent = dget_dlock(parent); + hlist_add_head(&new->d_sib, &parent->d_children); + if (parent->d_flags & DCACHE_DISCONNECTED) + new->d_flags |= DCACHE_DISCONNECTED; + spin_unlock(&parent->d_lock); + +retry: + rcu_read_lock(); + seq = smp_load_acquire(&parent->d_inode->i_dir_seq); + r_seq = read_seqbegin(&rename_lock); + dentry = __d_lookup_rcu(parent, name, &d_seq); + if (unlikely(dentry)) { + if (!lockref_get_not_dead(&dentry->d_lockref)) { + rcu_read_unlock(); + goto retry; + } + if (read_seqcount_retry(&dentry->d_seq, d_seq)) { + rcu_read_unlock(); + dput(dentry); + goto retry; + } + rcu_read_unlock(); + dput(new); + return dentry; + } + if (unlikely(read_seqretry(&rename_lock, r_seq))) { + rcu_read_unlock(); + goto retry; + } + + if (unlikely(seq & 1)) { + rcu_read_unlock(); + goto retry; + } + + hlist_bl_lock(b); + if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { + hlist_bl_unlock(b); + rcu_read_unlock(); + goto retry; + } + /* + * No changes for the parent since the beginning of d_lookup(). + * Since all removals from the chain happen with hlist_bl_lock(), + * any potential in-lookup matches are going to stay here until + * we unlock the chain. All fields are stable in everything + * we encounter. + */ + hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { + if (dentry->d_name.hash != hash) + continue; + if (dentry->d_parent != parent) + continue; + if (!d_same_name(dentry, parent, name)) + continue; + hlist_bl_unlock(b); + /* now we can try to grab a reference */ + if (!lockref_get_not_dead(&dentry->d_lockref)) { + rcu_read_unlock(); + goto retry; + } + + rcu_read_unlock(); + /* + * somebody is likely to be still doing lookup for it; + * wait for them to finish + */ + spin_lock(&dentry->d_lock); + d_wait_lookup(dentry); + /* + * it's not in-lookup anymore; in principle we should repeat + * everything from dcache lookup, but it's likely to be what + * d_lookup() would've found anyway. If it is, just return it; + * otherwise we really have to repeat the whole thing. + */ + if (unlikely(dentry->d_name.hash != hash)) + goto mismatch; + if (unlikely(dentry->d_parent != parent)) + goto mismatch; + if (unlikely(d_unhashed(dentry))) + goto mismatch; + if (unlikely(!d_same_name(dentry, parent, name))) + goto mismatch; + /* OK, it *is* a hashed match; return it */ + spin_unlock(&dentry->d_lock); + dput(new); + return dentry; + } + rcu_read_unlock(); + new->d_wait = wq; + hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); + hlist_bl_unlock(b); + return new; +mismatch: + spin_unlock(&dentry->d_lock); + dput(dentry); + goto retry; +} +EXPORT_SYMBOL(d_alloc_parallel); + +/* + * - Unhash the dentry + * - Retrieve and clear the waitqueue head in dentry + * - Return the waitqueue head + */ +static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) +{ + wait_queue_head_t *d_wait; + struct hlist_bl_head *b; + + lockdep_assert_held(&dentry->d_lock); + + b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); + hlist_bl_lock(b); + dentry->d_flags &= ~DCACHE_PAR_LOOKUP; + __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); + d_wait = dentry->d_wait; + dentry->d_wait = NULL; + hlist_bl_unlock(b); + INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_LIST_HEAD(&dentry->d_lru); + return d_wait; +} + +void __d_lookup_unhash_wake(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + wake_up_all(__d_lookup_unhash(dentry)); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(__d_lookup_unhash_wake); + +/* inode->i_lock held if inode is non-NULL */ + +static inline void __d_add(struct dentry *dentry, struct inode *inode, + const struct dentry_operations *ops) +{ + wait_queue_head_t *d_wait; + struct inode *dir = NULL; + unsigned n; + spin_lock(&dentry->d_lock); + if (unlikely(d_in_lookup(dentry))) { + dir = dentry->d_parent->d_inode; + n = start_dir_add(dir); + d_wait = __d_lookup_unhash(dentry); + } + if (unlikely(ops)) + d_set_d_op(dentry, ops); + if (inode) { + unsigned add_flags = d_flags_for_inode(inode); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); + __d_set_inode_and_type(dentry, inode, add_flags); + raw_write_seqcount_end(&dentry->d_seq); + fsnotify_update_flags(dentry); + } + __d_rehash(dentry); + if (dir) + end_dir_add(dir, n, d_wait); + spin_unlock(&dentry->d_lock); + if (inode) + spin_unlock(&inode->i_lock); +} + /** - * dentry_update_name_case - update case insensitive dentry with a new name - * @dentry: dentry to be updated - * @name: new name - * - * Update a case insensitive dentry with new case of name. + * d_add - add dentry to hash queues + * @entry: dentry to add + * @inode: The inode to attach to this dentry * - * dentry must have been returned by d_lookup with name @name. Old and new - * name lengths must match (ie. no d_compare which allows mismatched name - * lengths). - * - * Parent inode i_mutex must be held over d_lookup and into this call (to - * keep renames and concurrent inserts, and readdir(2) away). + * This adds the entry to the hash queues and initializes @inode. + * The entry was actually filled in earlier during d_alloc(). */ -void dentry_update_name_case(struct dentry *dentry, struct qstr *name) + +void d_add(struct dentry *entry, struct inode *inode) { - BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); - BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ + if (inode) { + security_d_instantiate(entry, inode); + spin_lock(&inode->i_lock); + } + __d_add(entry, inode, NULL); +} +EXPORT_SYMBOL(d_add); +struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) +{ + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!inode); + security_d_instantiate(dentry, inode); + spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); - write_seqcount_begin(&dentry->d_seq); - memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); - write_seqcount_end(&dentry->d_seq); + __d_instantiate(dentry, inode); + dentry->d_flags |= DCACHE_PERSISTENT; + dget_dlock(dentry); + if (d_unhashed(dentry)) + __d_rehash(dentry); spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + return dentry; } -EXPORT_SYMBOL(dentry_update_name_case); +EXPORT_SYMBOL(d_make_persistent); -static void switch_names(struct dentry *dentry, struct dentry *target) +static void swap_names(struct dentry *dentry, struct dentry *target) { - if (dname_external(target)) { - if (dname_external(dentry)) { + if (unlikely(dname_external(target))) { + if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ - swap(target->d_name.name, dentry->d_name.name); + swap(target->__d_name.name, dentry->__d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ - memcpy(target->d_iname, dentry->d_name.name, - dentry->d_name.len + 1); - dentry->d_name.name = target->d_name.name; - target->d_name.name = target->d_iname; + dentry->__d_name.name = target->__d_name.name; + target->d_shortname = dentry->d_shortname; + target->__d_name.name = target->d_shortname.string; } } else { - if (dname_external(dentry)) { + if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - target->d_name.name = dentry->d_name.name; - dentry->d_name.name = dentry->d_iname; + target->__d_name.name = dentry->__d_name.name; + dentry->d_shortname = target->d_shortname; + dentry->__d_name.name = dentry->d_shortname.string; } else { /* - * Both are internal. Just copy target to dentry + * Both are internal. */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.len = target->d_name.len; - return; + for (int i = 0; i < DNAME_INLINE_WORDS; i++) + swap(dentry->d_shortname.words[i], + target->d_shortname.words[i]); } } - swap(dentry->d_name.len, target->d_name.len); + swap(dentry->__d_name.hash_len, target->__d_name.hash_len); } -static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) +static void copy_name(struct dentry *dentry, struct dentry *target) { - /* - * XXXX: do we really need to take target->d_lock? - */ - if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent) - spin_lock(&target->d_parent->d_lock); - else { - if (d_ancestor(dentry->d_parent, target->d_parent)) { - spin_lock(&dentry->d_parent->d_lock); - spin_lock_nested(&target->d_parent->d_lock, - DENTRY_D_LOCK_NESTED); - } else { - spin_lock(&target->d_parent->d_lock); - spin_lock_nested(&dentry->d_parent->d_lock, - DENTRY_D_LOCK_NESTED); - } - } - if (target < dentry) { - spin_lock_nested(&target->d_lock, 2); - spin_lock_nested(&dentry->d_lock, 3); + struct external_name *old_name = NULL; + if (unlikely(dname_external(dentry))) + old_name = external_name(dentry); + if (unlikely(dname_external(target))) { + atomic_inc(&external_name(target)->count); + dentry->__d_name = target->__d_name; } else { - spin_lock_nested(&dentry->d_lock, 2); - spin_lock_nested(&target->d_lock, 3); + dentry->d_shortname = target->d_shortname; + dentry->__d_name.name = dentry->d_shortname.string; + dentry->__d_name.hash_len = target->__d_name.hash_len; } + if (old_name && likely(atomic_dec_and_test(&old_name->count))) + kfree_rcu(old_name, head); } -static void dentry_unlock_parents_for_move(struct dentry *dentry, - struct dentry *target) -{ - if (target->d_parent != dentry->d_parent) - spin_unlock(&dentry->d_parent->d_lock); - if (target->d_parent != target) - spin_unlock(&target->d_parent->d_lock); -} - -/* - * When switching names, the actual string doesn't strictly have to - * be preserved in the target - because we're dropping the target - * anyway. As such, we can just do a simple memcpy() to copy over - * the new name before we switch. - * - * Note that we have to be a lot more careful about getting the hash - * switched - we have to switch the hash value properly even if it - * then no longer matches the actual (corrupted) string of the target. - * The hash value has to match the hash queue that the dentry is on.. - */ /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry + * @exchange: exchange the two dentries * - * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller must hold - * rename_lock, the i_mutex of the source and target directories, - * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). + * Update the dcache to reflect the move of a file name. Negative dcache + * entries should not be moved in this way. Caller must hold rename_lock, the + * i_rwsem of the source and target directories (exclusively), and the sb-> + * s_vfs_rename_mutex if they differ. See lock_rename(). */ -static void __d_move(struct dentry * dentry, struct dentry * target) +static void __d_move(struct dentry *dentry, struct dentry *target, + bool exchange) { - if (!dentry->d_inode) - printk(KERN_WARNING "VFS: moving negative dcache entry\n"); - - BUG_ON(d_ancestor(dentry, target)); - BUG_ON(d_ancestor(target, dentry)); - - dentry_lock_for_move(dentry, target); - - write_seqcount_begin(&dentry->d_seq); - write_seqcount_begin(&target->d_seq); + struct dentry *old_parent, *p; + wait_queue_head_t *d_wait; + struct inode *dir = NULL; + unsigned n; - /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ - - /* - * Move the dentry to the target hash queue. Don't bother checking - * for the same hash queue because of how unlikely it is. - */ - __d_drop(dentry); - __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); - - /* Unhash the target: dput() will then get rid of it */ - __d_drop(target); - - list_del(&dentry->d_u.d_child); - list_del(&target->d_u.d_child); - - /* Switch the names.. */ - switch_names(dentry, target); - swap(dentry->d_name.hash, target->d_name.hash); + WARN_ON(!dentry->d_inode); + if (WARN_ON(dentry == target)) + return; - /* ... and switch the parents */ + BUG_ON(d_ancestor(target, dentry)); + old_parent = dentry->d_parent; + p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { - dentry->d_parent = target->d_parent; - target->d_parent = target; - INIT_LIST_HEAD(&target->d_u.d_child); + BUG_ON(p); + spin_lock(&target->d_parent->d_lock); + } else if (!p) { + /* target is not a descendent of dentry->d_parent */ + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { - swap(dentry->d_parent, target->d_parent); + BUG_ON(p == dentry); + spin_lock(&old_parent->d_lock); + if (p != target) + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); - /* And add them back to the (new) parent lists */ - list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); + if (unlikely(d_in_lookup(target))) { + dir = target->d_parent->d_inode; + n = start_dir_add(dir); + d_wait = __d_lookup_unhash(target); } - list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); + + /* unhash both */ + if (!d_unhashed(dentry)) + ___d_drop(dentry); + if (!d_unhashed(target)) + ___d_drop(target); + + /* ... and switch them in the tree */ + dentry->d_parent = target->d_parent; + if (!exchange) { + copy_name(dentry, target); + target->d_hash.pprev = NULL; + dentry->d_parent->d_lockref.count++; + if (dentry != old_parent) /* wasn't IS_ROOT */ + WARN_ON(!--old_parent->d_lockref.count); + } else { + target->d_parent = old_parent; + swap_names(dentry, target); + if (!hlist_unhashed(&target->d_sib)) + __hlist_del(&target->d_sib); + hlist_add_head(&target->d_sib, &target->d_parent->d_children); + __d_rehash(target); + fsnotify_update_flags(target); + } + if (!hlist_unhashed(&dentry->d_sib)) + __hlist_del(&dentry->d_sib); + hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); + __d_rehash(dentry); + fsnotify_update_flags(dentry); + fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); - dentry_unlock_parents_for_move(dentry, target); + if (dir) + end_dir_add(dir, n, d_wait); + + if (dentry->d_parent != old_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (dentry != old_parent) + spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); - fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); } @@ -2311,11 +2964,31 @@ static void __d_move(struct dentry * dentry, struct dentry * target) void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); - __d_move(dentry, target); + __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); +/* + * d_exchange - exchange two dentries + * @dentry1: first dentry + * @dentry2: second dentry + */ +void d_exchange(struct dentry *dentry1, struct dentry *dentry2) +{ + write_seqlock(&rename_lock); + + WARN_ON(!dentry1->d_inode); + WARN_ON(!dentry2->d_inode); + WARN_ON(IS_ROOT(dentry1)); + WARN_ON(IS_ROOT(dentry2)); + + __d_move(dentry1, dentry2, true); + + write_sequnlock(&rename_lock); +} +EXPORT_SYMBOL(d_exchange); + /** * d_ancestor - search for an ancestor * @p1: ancestor dentry @@ -2339,16 +3012,16 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * dentry->d_parent->d_inode->i_rwsem, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ -static struct dentry *__d_unalias(struct inode *inode, - struct dentry *dentry, struct dentry *alias) +static int __d_unalias(struct dentry *dentry, struct dentry *alias) { - struct mutex *m1 = NULL, *m2 = NULL; - struct dentry *ret = ERR_PTR(-EBUSY); + struct mutex *m1 = NULL; + struct rw_semaphore *m2 = NULL; + int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) @@ -2358,509 +3031,104 @@ static struct dentry *__d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; - m2 = &alias->d_parent->d_inode->i_mutex; + m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: - if (likely(!d_mountpoint(alias))) { - __d_move(alias, dentry); - ret = alias; - } + if (alias->d_op && alias->d_op->d_unalias_trylock && + !alias->d_op->d_unalias_trylock(alias)) + goto out_err; + __d_move(alias, dentry, false); + if (alias->d_op && alias->d_op->d_unalias_unlock) + alias->d_op->d_unalias_unlock(alias); + ret = 0; out_err: - spin_unlock(&inode->i_lock); if (m2) - mutex_unlock(m2); + up_read(m2); if (m1) mutex_unlock(m1); return ret; } -/* - * Prepare an anonymous dentry for life in the superblock's dentry tree as a - * named dentry in place of the dentry to be replaced. - * returns with anon->d_lock held! - */ -static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) +struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry, + const struct dentry_operations *ops) { - struct dentry *dparent; - - dentry_lock_for_move(anon, dentry); - - write_seqcount_begin(&dentry->d_seq); - write_seqcount_begin(&anon->d_seq); - - dparent = dentry->d_parent; - - switch_names(dentry, anon); - swap(dentry->d_name.hash, anon->d_name.hash); - - dentry->d_parent = dentry; - list_del_init(&dentry->d_u.d_child); - anon->d_parent = dparent; - list_move(&anon->d_u.d_child, &dparent->d_subdirs); - - write_seqcount_end(&dentry->d_seq); - write_seqcount_end(&anon->d_seq); - - dentry_unlock_parents_for_move(anon, dentry); - spin_unlock(&dentry->d_lock); - - /* anon->d_lock still locked, returns locked */ - anon->d_flags &= ~DCACHE_DISCONNECTED; -} - -/** - * d_materialise_unique - introduce an inode into the tree - * @dentry: candidate dentry - * @inode: inode to bind to the dentry, to which aliases may be attached - * - * Introduces an dentry into the tree, substituting an extant disconnected - * root directory alias in its place if there is one. Caller must hold the - * i_mutex of the parent directory. - */ -struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) -{ - struct dentry *actual; + if (IS_ERR(inode)) + return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); - if (!inode) { - actual = dentry; - __d_instantiate(dentry, NULL); - d_rehash(actual); - goto out_nolock; - } + if (!inode) + goto out; + security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); - if (S_ISDIR(inode->i_mode)) { - struct dentry *alias; - - /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode, 0); - if (alias) { - actual = alias; + struct dentry *new = __d_find_any_alias(inode); + if (unlikely(new)) { + /* The reference to new ensures it remains an alias */ + spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); - - if (d_ancestor(alias, dentry)) { - /* Check for loops */ - actual = ERR_PTR(-ELOOP); - spin_unlock(&inode->i_lock); - } else if (IS_ROOT(alias)) { - /* Is this an anonymous mountpoint that we - * could splice into our tree? */ - __d_materialise_dentry(dentry, alias); + if (unlikely(d_ancestor(new, dentry))) { + write_sequnlock(&rename_lock); + dput(new); + new = ERR_PTR(-ELOOP); + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); + } else if (!IS_ROOT(new)) { + struct dentry *old_parent = dget(new->d_parent); + int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); - __d_drop(alias); - goto found; + if (err) { + dput(new); + new = ERR_PTR(err); + } + dput(old_parent); } else { - /* Nope, but we must(!) avoid directory - * aliasing. This drops inode->i_lock */ - actual = __d_unalias(inode, dentry, alias); - } - write_sequnlock(&rename_lock); - if (IS_ERR(actual)) { - if (PTR_ERR(actual) == -ELOOP) - pr_warn_ratelimited( - "VFS: Lookup of '%s' in %s %s" - " would have caused loop\n", - dentry->d_name.name, - inode->i_sb->s_type->name, - inode->i_sb->s_id); - dput(alias); + __d_move(new, dentry, false); + write_sequnlock(&rename_lock); } - goto out_nolock; - } - } - - /* Add a unique reference */ - actual = __d_instantiate_unique(dentry, inode); - if (!actual) - actual = dentry; - else - BUG_ON(!d_unhashed(actual)); - - spin_lock(&actual->d_lock); -found: - _d_rehash(actual); - spin_unlock(&actual->d_lock); - spin_unlock(&inode->i_lock); -out_nolock: - if (actual == dentry) { - security_d_instantiate(dentry, inode); - return NULL; - } - - iput(inode); - return actual; -} -EXPORT_SYMBOL_GPL(d_materialise_unique); - -static int prepend(char **buffer, int *buflen, const char *str, int namelen) -{ - *buflen -= namelen; - if (*buflen < 0) - return -ENAMETOOLONG; - *buffer -= namelen; - memcpy(*buffer, str, namelen); - return 0; -} - -static int prepend_name(char **buffer, int *buflen, struct qstr *name) -{ - return prepend(buffer, buflen, name->name, name->len); -} - -/** - * prepend_path - Prepend path string to a buffer - * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry - * @buffer: pointer to the end of the buffer - * @buflen: pointer to buffer length - * - * Caller holds the rename_lock. - */ -static int prepend_path(const struct path *path, - const struct path *root, - char **buffer, int *buflen) -{ - struct dentry *dentry = path->dentry; - struct vfsmount *vfsmnt = path->mnt; - struct mount *mnt = real_mount(vfsmnt); - bool slash = false; - int error = 0; - - while (dentry != root->dentry || vfsmnt != root->mnt) { - struct dentry * parent; - - if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - /* Global root? */ - if (!mnt_has_parent(mnt)) - goto global_root; - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - vfsmnt = &mnt->mnt; - continue; + iput(inode); + return new; } - parent = dentry->d_parent; - prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(buffer, buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (!error) - error = prepend(buffer, buflen, "/", 1); - if (error) - break; - - slash = true; - dentry = parent; } - - if (!error && !slash) - error = prepend(buffer, buflen, "/", 1); - - return error; - -global_root: - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, dentry->d_name.name); - } - if (!slash) - error = prepend(buffer, buflen, "/", 1); - if (!error) - error = is_mounted(vfsmnt) ? 1 : 2; - return error; +out: + __d_add(dentry, inode, ops); + return NULL; } /** - * __d_path - return the path of a dentry - * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry - * @buf: buffer to return value in - * @buflen: buffer length - * - * Convert a dentry into an ASCII path name. + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. * - * Returns a pointer into the buffer or an error code if the - * path was too long. + * If inode is a directory and has an IS_ROOT alias, then d_move that in + * place of the given dentry and return it, else simply d_add the inode + * to the dentry and return NULL. * - * "buflen" should be positive. + * If a non-IS_ROOT directory is found, the filesystem is corrupt, and + * we should error out: directories can't have multiple aliases. * - * If the path is not reachable from the supplied root, return %NULL. - */ -char *__d_path(const struct path *path, - const struct path *root, - char *buf, int buflen) -{ - char *res = buf + buflen; - int error; - - prepend(&res, &buflen, "\0", 1); - br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); - error = prepend_path(path, root, &res, &buflen); - write_sequnlock(&rename_lock); - br_read_unlock(&vfsmount_lock); - - if (error < 0) - return ERR_PTR(error); - if (error > 0) - return NULL; - return res; -} - -char *d_absolute_path(const struct path *path, - char *buf, int buflen) -{ - struct path root = {}; - char *res = buf + buflen; - int error; - - prepend(&res, &buflen, "\0", 1); - br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); - error = prepend_path(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); - br_read_unlock(&vfsmount_lock); - - if (error > 1) - error = -EINVAL; - if (error < 0) - return ERR_PTR(error); - return res; -} - -/* - * same as __d_path but appends "(deleted)" for unlinked files. - */ -static int path_with_deleted(const struct path *path, - const struct path *root, - char **buf, int *buflen) -{ - prepend(buf, buflen, "\0", 1); - if (d_unlinked(path->dentry)) { - int error = prepend(buf, buflen, " (deleted)", 10); - if (error) - return error; - } - - return prepend_path(path, root, buf, buflen); -} - -static int prepend_unreachable(char **buffer, int *buflen) -{ - return prepend(buffer, buflen, "(unreachable)", 13); -} - -/** - * d_path - return the path of a dentry - * @path: path to report - * @buf: buffer to return value in - * @buflen: buffer length - * - * Convert a dentry into an ASCII path name. If the entry has been deleted - * the string " (deleted)" is appended. Note that this is ambiguous. + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. * - * Returns a pointer into the buffer or an error code if the path was - * too long. Note: Callers should use the returned pointer, not the passed - * in buffer, to use the name! The implementation often starts at an offset - * into the buffer, and may leave 0 bytes at the start. + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. * - * "buflen" should be positive. - */ -char *d_path(const struct path *path, char *buf, int buflen) -{ - char *res = buf + buflen; - struct path root; - int error; - - /* - * We have various synthetic filesystems that never get mounted. On - * these filesystems dentries are never used for lookup purposes, and - * thus don't need to be hashed. They also don't need a name until a - * user wants to identify the object in /proc/pid/fd/. The little hack - * below allows us to generate a name for these objects on demand: - */ - if (path->dentry->d_op && path->dentry->d_op->d_dname) - return path->dentry->d_op->d_dname(path->dentry, buf, buflen); - - get_fs_root(current->fs, &root); - br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); - error = path_with_deleted(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); - br_read_unlock(&vfsmount_lock); - if (error < 0) - res = ERR_PTR(error); - path_put(&root); - return res; -} -EXPORT_SYMBOL(d_path); - -/* - * Helper function for dentry_operations.d_dname() members - */ -char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, - const char *fmt, ...) -{ - va_list args; - char temp[64]; - int sz; - - va_start(args, fmt); - sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; - va_end(args); - - if (sz > sizeof(temp) || sz > buflen) - return ERR_PTR(-ENAMETOOLONG); - - buffer += buflen - sz; - return memcpy(buffer, temp, sz); -} - -/* - * Write full pathname from the root of the filesystem into the buffer. + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. */ -static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) -{ - char *end = buf + buflen; - char *retval; - - prepend(&end, &buflen, "\0", 1); - if (buflen < 1) - goto Elong; - /* Get '/' right */ - retval = end-1; - *retval = '/'; - - while (!IS_ROOT(dentry)) { - struct dentry *parent = dentry->d_parent; - int error; - - prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(&end, &buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (error != 0 || prepend(&end, &buflen, "/", 1) != 0) - goto Elong; - - retval = end; - dentry = parent; - } - return retval; -Elong: - return ERR_PTR(-ENAMETOOLONG); -} - -char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) -{ - char *retval; - - write_seqlock(&rename_lock); - retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); - - return retval; -} -EXPORT_SYMBOL(dentry_path_raw); - -char *dentry_path(struct dentry *dentry, char *buf, int buflen) +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { - char *p = NULL; - char *retval; - - write_seqlock(&rename_lock); - if (d_unlinked(dentry)) { - p = buf + buflen; - if (prepend(&p, &buflen, "//deleted", 10) != 0) - goto Elong; - buflen++; - } - retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); - if (!IS_ERR(retval) && p) - *p = '/'; /* restore '/' overriden with '\0' */ - return retval; -Elong: - return ERR_PTR(-ENAMETOOLONG); -} - -/* - * NOTE! The user-level library version returns a - * character pointer. The kernel system call just - * returns the length of the buffer filled (which - * includes the ending '\0' character), or a negative - * error value. So libc would do something like - * - * char *getcwd(char * buf, size_t size) - * { - * int retval; - * - * retval = sys_getcwd(buf, size); - * if (retval >= 0) - * return buf; - * errno = -retval; - * return NULL; - * } - */ -SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) -{ - int error; - struct path pwd, root; - char *page = (char *) __get_free_page(GFP_USER); - - if (!page) - return -ENOMEM; - - get_fs_root_and_pwd(current->fs, &root, &pwd); - - error = -ENOENT; - br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); - if (!d_unlinked(pwd.dentry)) { - unsigned long len; - char *cwd = page + PAGE_SIZE; - int buflen = PAGE_SIZE; - - prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &root, &cwd, &buflen); - write_sequnlock(&rename_lock); - br_read_unlock(&vfsmount_lock); - - if (error < 0) - goto out; - - /* Unreachable from current root */ - if (error > 0) { - error = prepend_unreachable(&cwd, &buflen); - if (error) - goto out; - } - - error = -ERANGE; - len = PAGE_SIZE + page - cwd; - if (len <= size) { - error = len; - if (copy_to_user(buf, cwd, len)) - error = -EFAULT; - } - } else { - write_sequnlock(&rename_lock); - br_read_unlock(&vfsmount_lock); - } - -out: - path_put(&pwd); - path_put(&root); - free_page((unsigned long) page); - return error; + return d_splice_alias_ops(inode, dentry, NULL); } +EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. @@ -2873,117 +3141,89 @@ out: * @new_dentry: new dentry * @old_dentry: old dentry * - * Returns 1 if new_dentry is a subdirectory of the parent (at any depth). - * Returns 0 otherwise. + * Returns true if new_dentry is a subdirectory of the parent (at any depth). + * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ -int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) +bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { - int result; + bool subdir; unsigned seq; if (new_dentry == old_dentry) - return 1; - - do { - /* for restarting inner loop in case of seq retry */ - seq = read_seqbegin(&rename_lock); - /* - * Need rcu_readlock to protect against the d_parent trashing - * due to d_move - */ - rcu_read_lock(); - if (d_ancestor(old_dentry, new_dentry)) - result = 1; - else - result = 0; - rcu_read_unlock(); - } while (read_seqretry(&rename_lock, seq)); - - return result; -} - -void d_genocide(struct dentry *root) -{ - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int locked = 0; + return true; + /* Access d_parent under rcu as d_move() may change it. */ + rcu_read_lock(); seq = read_seqbegin(&rename_lock); -again: - this_parent = root; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; - - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (d_unhashed(dentry) || !dentry->d_inode) { - spin_unlock(&dentry->d_lock); - continue; - } - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_count--; - } - spin_unlock(&dentry->d_lock); + subdir = d_ancestor(old_dentry, new_dentry); + /* Try lockless once... */ + if (read_seqretry(&rename_lock, seq)) { + /* ...else acquire lock for progress even on deep chains. */ + read_seqlock_excl(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + read_sequnlock_excl(&rename_lock); } - if (this_parent != root) { - struct dentry *child = this_parent; - if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { - this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_count--; - } - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; - } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return; - -rename_retry: - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; + rcu_read_unlock(); + return subdir; } +EXPORT_SYMBOL(is_subdir); -void d_tmpfile(struct dentry *dentry, struct inode *inode) +void d_mark_tmpfile(struct file *file, struct inode *inode) { - inode_dec_link_count(inode); - BUG_ON(dentry->d_name.name != dentry->d_iname || - !hlist_unhashed(&dentry->d_alias) || + struct dentry *dentry = file->f_path.dentry; + + BUG_ON(dname_external(dentry) || + !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); +} +EXPORT_SYMBOL(d_mark_tmpfile); + +void d_tmpfile(struct file *file, struct inode *inode) +{ + struct dentry *dentry = file->f_path.dentry; + + inode_dec_link_count(inode); + d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); +/* + * Obtain inode number of the parent dentry. + */ +ino_t d_parent_ino(struct dentry *dentry) +{ + struct dentry *parent; + struct inode *iparent; + unsigned seq; + ino_t ret; + + scoped_guard(rcu) { + seq = raw_seqcount_begin(&dentry->d_seq); + parent = READ_ONCE(dentry->d_parent); + iparent = d_inode_rcu(parent); + if (likely(iparent)) { + ret = iparent->i_ino; + if (!read_seqcount_retry(&dentry->d_seq, seq)) + return ret; + } + } + + spin_lock(&dentry->d_lock); + ret = dentry->d_parent->d_inode->i_ino; + spin_unlock(&dentry->d_lock); + return ret; +} +EXPORT_SYMBOL(d_parent_ino); + static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { @@ -2996,8 +3236,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -3009,27 +3247,28 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, - &d_hash_mask, + NULL, 0, 0); + d_hash_shift = 32 - d_hash_shift; - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); + runtime_const_init(shift, d_hash_shift); + runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ - dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + __dentry_cache = KMEM_CACHE_USERCOPY(dentry, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, + d_shortname.string); + runtime_const_init(ptr, __dentry_cache); /* Hash may have been set up in dcache_init_early */ if (!hashdist) @@ -3040,44 +3279,41 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, - &d_hash_mask, + NULL, 0, 0); + d_hash_shift = 32 - d_hash_shift; - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); + runtime_const_init(shift, d_hash_shift); + runtime_const_init(ptr, dentry_hashtable); } /* SLAB cache for __getname() consumers */ -struct kmem_cache *names_cachep __read_mostly; +struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); -EXPORT_SYMBOL(d_genocide); - void __init vfs_caches_init_early(void) { + int i; + + for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) + INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); + dcache_init_early(); inode_init_early(); } -void __init vfs_caches_init(unsigned long mempages) +void __init vfs_caches_init(void) { - unsigned long reserve; - - /* Base hash sizes on available memory, with a reserve equal to - 150% of current kernel size */ - - reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); - mempages -= reserve; - - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); - files_init(mempages); + files_init(); + files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); |
