From 86c8749ede0c59e590de9267066932a26f1ce796 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:18 +1100 Subject: vfs: revert per-cpu nr_unused counters for dentry and inodes The nr_unused counters count the number of objects on an LRU, and as such they are synchronized with LRU object insertion and removal and scanning, and protected under the LRU lock. Making it per-cpu does not actually get any concurrency improvements because of this lock, and summing the counter is much slower, and incrementing/decrementing it costs more code size and is slower too. These counters should stay per-LRU, which currently means global. Signed-off-by: Nick Piggin --- fs/inode.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index ae2727ab0c3a..efc43979709f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -103,7 +103,6 @@ static DECLARE_RWSEM(iprune_sem); struct inodes_stat_t inodes_stat; static struct percpu_counter nr_inodes __cacheline_aligned_in_smp; -static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp; static struct kmem_cache *inode_cachep __read_mostly; @@ -114,7 +113,7 @@ static inline int get_nr_inodes(void) static inline int get_nr_inodes_unused(void) { - return percpu_counter_sum_positive(&nr_inodes_unused); + return inodes_stat.nr_unused; } int get_nr_dirty_inodes(void) @@ -132,7 +131,6 @@ int proc_nr_inodes(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); - inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_dointvec(table, write, buffer, lenp, ppos); } #endif @@ -335,7 +333,7 @@ static void inode_lru_list_add(struct inode *inode) { if (list_empty(&inode->i_lru)) { list_add(&inode->i_lru, &inode_lru); - percpu_counter_inc(&nr_inodes_unused); + inodes_stat.nr_unused++; } } @@ -343,7 +341,7 @@ static void inode_lru_list_del(struct inode *inode) { if (!list_empty(&inode->i_lru)) { list_del_init(&inode->i_lru); - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } } @@ -513,7 +511,7 @@ void evict_inodes(struct super_block *sb) list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); if (!(inode->i_state & (I_DIRTY | I_SYNC))) - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -554,7 +552,7 @@ int invalidate_inodes(struct super_block *sb) list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); if (!(inode->i_state & (I_DIRTY | I_SYNC))) - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -616,7 +614,7 @@ static void prune_icache(int nr_to_scan) if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { list_del_init(&inode->i_lru); - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; continue; } @@ -650,7 +648,7 @@ static void prune_icache(int nr_to_scan) */ list_move(&inode->i_lru, &freeable); list_del_init(&inode->i_wb_list); - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); @@ -1649,7 +1647,6 @@ void __init inode_init(void) init_once); register_shrinker(&icache_shrinker); percpu_counter_init(&nr_inodes, 0); - percpu_counter_init(&nr_inodes_unused, 0); /* Hash may have been set up in inode_init_early */ if (!hashdist) -- cgit From 3e880fb5e4bb6a012035e3edd0586ee2817c2e24 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:19 +1100 Subject: fs: use fast counters for vfs caches percpu_counter library generates quite nasty code, so unless you need to dynamically allocate counters or take fast approximate value, a simple per cpu set of counters is much better. The percpu_counter can never be made to work as well, because it has an indirection from pointer to percpu memory, and it can't use direct this_cpu_inc interfaces because it doesn't use static PER_CPU data, so code will always be worse. In the fastpath, it is the difference between this: incl %gs:nr_dentry # nr_dentry and this: movl percpu_counter_batch(%rip), %edx # percpu_counter_batch, movl $1, %esi #, movq $nr_dentry, %rdi #, call __percpu_counter_add # (plus I clobber registers) __percpu_counter_add: pushq %rbp # movq %rsp, %rbp #, subq $32, %rsp #, movq %rbx, -24(%rbp) #, movq %r12, -16(%rbp) #, movq %r13, -8(%rbp) #, movq %rdi, %rbx # fbc, fbc #APP # 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1 movq %gs:kernel_stack,%rax #, pfo_ret__ # 0 "" 2 #NO_APP incl -8124(%rax) # .preempt_count movq 32(%rdi), %r12 # .counters, tcp_ptr__ #APP # 78 "lib/percpu_counter.c" 1 add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__ # 0 "" 2 #NO_APP movslq (%r12),%r13 #* tcp_ptr__, tmp73 movslq %edx,%rax # batch, batch addq %rsi, %r13 # amount, count cmpq %rax, %r13 # batch, count jge .L27 #, negl %edx # tmp76 movslq %edx,%rdx # tmp76, tmp77 cmpq %rdx, %r13 # tmp77, count jg .L28 #, .L27: movq %rbx, %rdi # fbc, call _raw_spin_lock # addq %r13, 8(%rbx) # count, .count movq %rbx, %rdi # fbc, movl $0, (%r12) #,* tcp_ptr__ call _raw_spin_unlock # .L29: #APP # 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1 movq %gs:kernel_stack,%rax #, pfo_ret__ # 0 "" 2 #NO_APP decl -8124(%rax) # .preempt_count movq -8136(%rax), %rax #, D.14625 testb $8, %al #, D.14625 jne .L32 #, .L31: movq -24(%rbp), %rbx #, movq -16(%rbp), %r12 #, movq -8(%rbp), %r13 #, leave ret .p2align 4,,10 .p2align 3 .L28: movl %r13d, (%r12) # count,* jmp .L29 # .L32: call preempt_schedule # .p2align 4,,6 jmp .L31 # .size __percpu_counter_add, .-__percpu_counter_add .p2align 4,,15 Signed-off-by: Nick Piggin --- fs/inode.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index efc43979709f..5a0a898f55d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -102,13 +102,17 @@ static DECLARE_RWSEM(iprune_sem); */ struct inodes_stat_t inodes_stat; -static struct percpu_counter nr_inodes __cacheline_aligned_in_smp; +static DEFINE_PER_CPU(unsigned int, nr_inodes); static struct kmem_cache *inode_cachep __read_mostly; -static inline int get_nr_inodes(void) +static int get_nr_inodes(void) { - return percpu_counter_sum_positive(&nr_inodes); + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_inodes, i); + return sum < 0 ? 0 : sum; } static inline int get_nr_inodes_unused(void) @@ -118,9 +122,9 @@ static inline int get_nr_inodes_unused(void) int get_nr_dirty_inodes(void) { + /* not actually dirty inodes, but a wild approximation */ int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; - } /* @@ -222,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif - percpu_counter_inc(&nr_inodes); + this_cpu_inc(nr_inodes); return 0; out: @@ -264,7 +268,7 @@ void __destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif - percpu_counter_dec(&nr_inodes); + this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); @@ -1646,7 +1650,6 @@ void __init inode_init(void) SLAB_MEM_SPREAD), init_once); register_shrinker(&icache_shrinker); - percpu_counter_init(&nr_inodes, 0); /* Hash may have been set up in inode_init_early */ if (!hashdist) -- cgit From fa0d7e3de6d6fc5004ad9dea0dd6b286af8f03e9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:49 +1100 Subject: fs: icache RCU free inodes RCU free the struct inode. This will allow: - Subsequent store-free path walking patch. The inode must be consulted for permissions when walking, so an RCU inode reference is a must. - sb_inode_list_lock to be moved inside i_lock because sb list walkers who want to take i_lock no longer need to take sb_inode_list_lock to walk the list in the first place. This will simplify and optimize locking. - Could remove some nested trylock loops in dcache code - Could potentially simplify things a bit in VM land. Do not need to take the page lock to follow page->mapping. The downsides of this is the performance cost of using RCU. In a simple creat/unlink microbenchmark, performance drops by about 10% due to inability to reuse cache-hot slab objects. As iterations increase and RCU freeing starts kicking over, this increases to about 20%. In cases where inode lifetimes are longer (ie. many inodes may be allocated during the average life span of a single inode), a lot of this cache reuse is not applicable, so the regression caused by this patch is smaller. The cache-hot regression could largely be avoided by using SLAB_DESTROY_BY_RCU, however this adds some complexity to list walking and store-free path walking, so I prefer to implement this at a later date, if it is shown to be a win in real situations. I haven't found a regression in any non-micro benchmark so I doubt it will be a problem. Signed-off-by: Nick Piggin --- fs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 5a0a898f55d1..6751dfe8cc06 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -272,6 +272,13 @@ void __destroy_inode(struct inode *inode) } EXPORT_SYMBOL(__destroy_inode); +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(inode_cachep, inode); +} + static void destroy_inode(struct inode *inode) { BUG_ON(!list_empty(&inode->i_lru)); @@ -279,7 +286,7 @@ static void destroy_inode(struct inode *inode) if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else - kmem_cache_free(inode_cachep, (inode)); + call_rcu(&inode->i_rcu, i_callback); } /* @@ -432,6 +439,7 @@ void end_writeback(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); + /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(end_writeback); -- cgit From ff0c7d15f9787b7e8c601533c015295cc68329f8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:50 +1100 Subject: fs: avoid inode RCU freeing for pseudo fs Pseudo filesystems that don't put inode on RCU list or reachable by rcu-walk dentries do not need to RCU free their inodes. Signed-off-by: Nick Piggin --- fs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 6751dfe8cc06..da85e56378f3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -257,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb) return inode; } +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); -- cgit