From bee9182d955227f01ff3b80c4cb6acca9bb40b11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Jul 2015 23:48:20 +0200 Subject: introduce __sb_writers_{acquired,release}() helpers Preparation to hide the sb->s_writers internals from xfs and btrfs. Add 2 trivial define's they can use rather than play with ->s_writers directly. No changes in btrfs/transaction.o and xfs/xfs_aops.o. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f7..acb7cad84edd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1391,6 +1391,11 @@ extern struct timespec current_fs_time(struct super_block *sb); void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); +#define __sb_writers_acquired(sb, lev) \ + rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to -- cgit From 9287f6925ad9d8fb8c6283066b4f77fd87f123a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 17:45:57 +0200 Subject: percpu-rwsem: introduce percpu_down_read_trylock() Add percpu_down_read_trylock(), it will have the user soon. Signed-off-by: Oleg Nesterov --- include/linux/percpu-rwsem.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 3e88c9a7d57f..16c30cd33501 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -16,6 +16,7 @@ struct percpu_rw_semaphore { }; extern void percpu_down_read(struct percpu_rw_semaphore *); +extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); extern void percpu_up_read(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); -- cgit From 55cc156505f2e43fa45dbd4bfe8f9c9d848ca44c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 20:26:44 +0200 Subject: percpu-rwsem: introduce percpu_rwsem_release() and percpu_rwsem_acquire() Add percpu_rwsem_release() and percpu_rwsem_acquire() for the users which need to return to userspace with percpu-rwsem lock held and/or pass the ownership to another thread. TODO: change percpu_rwsem_release() to use rwsem_clear_owner(). We can either fold kernel/locking/rwsem.h into include/linux/rwsem.h, or add the non-inline percpu_rwsem_clear_owner(). Signed-off-by: Oleg Nesterov --- include/linux/percpu-rwsem.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 16c30cd33501..834c4e52cb2d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -32,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); __percpu_init_rwsem(brw, #brw, &rwsem_key); \ }) + +#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) + +static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_release(&sem->rw_sem.dep_map, 1, ip); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + if (!read) + sem->rw_sem.owner = NULL; +#endif +} + +static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); +} + #endif -- cgit From 853b39a7c82826b8413048feec7bf08e98ce7a84 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jul 2015 20:21:13 +0200 Subject: shift percpu_counter_destroy() into destroy_super_work() Of course, this patch is ugly as hell. It will be (partially) reverted later. We add it to ensure that other WIP changes in percpu_rw_semaphore won't break fs/super.c. We do not even need this change right now, percpu_free_rwsem() is fine in atomic context. But we are going to change this, it will be might_sleep() after we merge the rcu_sync() patches. And even after that we do not really need destroy_super_work(), we will kill it in any case. Instead, destroy_super_rcu() should just check that rss->cb_state == CB_IDLE and do call_rcu() again in the (very unlikely) case this is not true. So this is just the temporary kludge which helps us to avoid the conflicts with the changes which will be (hopefully) routed via rcu tree. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index acb7cad84edd..4bed78966c6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1375,7 +1376,7 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; - + struct work_struct destroy_work; /* * Indicates how deep in a filesystem stack this SB is */ -- cgit From 8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 17:05:04 +0200 Subject: change sb_writers to use percpu_rw_semaphore We can remove everything from struct sb_writers except frozen and add the array of percpu_rw_semaphore's instead. This patch doesn't remove sb_writers->wait_unfrozen yet, we keep it for get_super_thawed(). We will probably remove it later. This change tries to address the following problems: - Firstly, __sb_start_write() looks simply buggy. It does __sb_end_write() if it sees ->frozen, but if it migrates to another CPU before percpu_counter_dec(), sb_wait_write() can wrongly succeed if there is another task which holds the same "semaphore": sb_wait_write() can miss the result of the previous percpu_counter_inc() but see the result of this percpu_counter_dec(). - As Dave Hansen reports, it is suboptimal. The trivial microbenchmark that writes to a tmpfs file in a loop runs 12% faster if we change this code to rely on RCU and kill the memory barriers. - This code doesn't look simple. It would be better to rely on the generic locking code. According to Dave, this change adds the same performance improvement. Note: with this change both freeze_super() and thaw_super() will do synchronize_sched_expedited() 3 times. This is just ugly. But: - This will be "fixed" by the rcu_sync changes we are going to merge. After that freeze_super()->percpu_down_write() will use synchronize_sched(), and thaw_super() won't use synchronize() at all. This doesn't need any changes in fs/super.c. - Once we merge rcu_sync changes, we can also change super.c so that all wb_write->rw_sem's will share the single ->rss in struct sb_writes, then freeze_super() will need only one synchronize_sched(). Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- include/linux/fs.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4bed78966c6b..ce356f66cc2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1,7 +1,6 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H - #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include @@ -1275,16 +1275,9 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - /* Counters for counting writers at each level */ - struct percpu_counter counter[SB_FREEZE_LEVELS]; - wait_queue_head_t wait; /* queue for waiting for - writers / faults to finish */ - int frozen; /* Is sb frozen? */ - wait_queue_head_t wait_unfrozen; /* queue for waiting for - sb to be thawed */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map lock_map[SB_FREEZE_LEVELS]; -#endif + int frozen; /* Is sb frozen? */ + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { @@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); #define __sb_writers_acquired(sb, lev) \ - rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ - rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) /** * sb_end_write - drop write access to a superblock -- cgit From cbedaac63481dea52327127a9f1c60f092bd6b07 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Mar 2015 08:19:11 -0400 Subject: inode: add hlist_fake to avoid the inode hash lock in evict Some filesystems don't use the VFS inode hash and fake the fact they are hashed so that all the writeback code works correctly. However, this means the evict() path still tries to remove the inode from the hash, meaning that the inode_hash_lock() needs to be taken unnecessarily. Hence under certain workloads the inode_hash_lock can be contended even if the inode is never actually hashed. To avoid this add hlist_fake to test if the inode isn't actually hashed to avoid taking the hash lock on inodes that have never been hashed. Based on Dave Chinner's inode: add IOP_NOTHASHED to avoid inode hash lock in evict basd on Al's suggestions. Thanks, Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 2 +- include/linux/list.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f7..4a40fa843040 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2608,7 +2608,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { - if (!inode_unhashed(inode)) + if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } diff --git a/include/linux/list.h b/include/linux/list.h index feb773c76ee0..3e3e64a61002 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -672,6 +672,11 @@ static inline void hlist_add_fake(struct hlist_node *n) n->pprev = &n->next; } +static inline bool hlist_fake(struct hlist_node *h) +{ + return h->pprev == &h->next; +} + /* * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. -- cgit From 74278da9f70d84d715601fe794567a6d2bfdf078 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 12:37:22 -0500 Subject: inode: convert inode_sb_list_lock to per-sb The process of reducing contention on per-superblock inode lists starts with moving the locking to match the per-superblock inode list. This takes the global lock out of the picture and reduces the contention problems to within a single filesystem. This doesn't get rid of contention as the locks still have global CPU scope, but it does isolate operations on different superblocks form each other. Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 5 ++++- include/linux/fsnotify_backend.h | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4a40fa843040..09bbd38485f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1309,7 +1309,6 @@ struct super_block { #endif const struct xattr_handler **s_xattr; - struct list_head s_inodes; /* all inodes */ struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; @@ -1380,6 +1379,10 @@ struct super_block { * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 65a517dd32f7..0390ee69c439 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -357,7 +357,7 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); -extern void fsnotify_unmount_inodes(struct list_head *list); +extern void fsnotify_unmount_inodes(struct super_block *sb); /* put here because inotify does some weird stuff when destroying watches */ extern void fsnotify_init_event(struct fsnotify_event *event, @@ -393,7 +393,7 @@ static inline u32 fsnotify_get_cookie(void) return 0; } -static inline void fsnotify_unmount_inodes(struct list_head *list) +static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ -- cgit From e97fedb9ef9868ff24d588be781906cf7c1b59ae Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 13:40:00 -0500 Subject: sync: serialise per-superblock sync operations When competing sync(2) calls walk the same filesystem, they need to walk the list of inodes on the superblock to find all the inodes that we need to wait for IO completion on. However, when multiple wait_sb_inodes() calls do this at the same time, they contend on the the inode_sb_list_lock and the contention causes system wide slowdowns. In effect, concurrent sync(2) calls can take longer and burn more CPU than if they were serialised. Stop the worst of the contention by adding a per-sb mutex to wrap around wait_sb_inodes() so that we only execute one sync(2) IO completion walk per superblock superblock at a time and hence avoid contention being triggered by concurrent sync(2) calls. Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 09bbd38485f9..82dfc5519b4b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1375,6 +1375,8 @@ struct super_block { struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + struct mutex s_sync_lock; /* sync serialisation lock */ + /* * Indicates how deep in a filesystem stack this SB is */ -- cgit From c7f5408493aeb01532927b2276316797a03ed6ee Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 14:07:22 -0500 Subject: inode: rename i_wb_list to i_io_list There's a small consistency problem between the inode and writeback naming. Writeback calls the "for IO" inode queues b_io and b_more_io, but the inode calls these the "writeback list" or i_wb_list. This makes it hard to an new "under writeback" list to the inode, or call it an "under IO" list on the bdi because either way we'll have writeback on IO and IO on writeback and it'll just be confusing. I'm getting confused just writing this! So, rename the inode "for IO" list variable to i_io_list so we can add a new "writeback list" in a subsequent patch. Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 82dfc5519b4b..34cfa60db678 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -636,7 +636,7 @@ struct inode { unsigned long dirtied_time_when; struct hlist_node i_hash; - struct list_head i_wb_list; /* backing dev IO list */ + struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ -- cgit