From bee9182d955227f01ff3b80c4cb6acca9bb40b11 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sun, 19 Jul 2015 23:48:20 +0200
Subject: introduce __sb_writers_{acquired,release}() helpers

Preparation to hide the sb->s_writers internals from xfs and btrfs.
Add 2 trivial define's they can use rather than play with ->s_writers
directly. No changes in btrfs/transaction.o and xfs/xfs_aops.o.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
---
 include/linux/fs.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 84b783f277f7..acb7cad84edd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1391,6 +1391,11 @@ extern struct timespec current_fs_time(struct super_block *sb);
 void __sb_end_write(struct super_block *sb, int level);
 int __sb_start_write(struct super_block *sb, int level, bool wait);
 
+#define __sb_writers_acquired(sb, lev)	\
+	rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
+#define __sb_writers_release(sb, lev)	\
+	rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
+
 /**
  * sb_end_write - drop write access to a superblock
  * @sb: the super we wrote to
-- 
cgit 


From 9287f6925ad9d8fb8c6283066b4f77fd87f123a9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 17:45:57 +0200
Subject: percpu-rwsem: introduce percpu_down_read_trylock()

Add percpu_down_read_trylock(), it will have the user soon.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/percpu-rwsem.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 3e88c9a7d57f..16c30cd33501 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -16,6 +16,7 @@ struct percpu_rw_semaphore {
 };
 
 extern void percpu_down_read(struct percpu_rw_semaphore *);
+extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
 extern void percpu_up_read(struct percpu_rw_semaphore *);
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
-- 
cgit 


From 55cc156505f2e43fa45dbd4bfe8f9c9d848ca44c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 20:26:44 +0200
Subject: percpu-rwsem: introduce percpu_rwsem_release() and
 percpu_rwsem_acquire()

Add percpu_rwsem_release() and percpu_rwsem_acquire() for the users
which need to return to userspace with percpu-rwsem lock held and/or
pass the ownership to another thread.

TODO: change percpu_rwsem_release() to use rwsem_clear_owner(). We can
either fold kernel/locking/rwsem.h into include/linux/rwsem.h, or add
the non-inline percpu_rwsem_clear_owner().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/percpu-rwsem.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 16c30cd33501..834c4e52cb2d 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -32,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
 })
 
+
+#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
+
+static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
+					bool read, unsigned long ip)
+{
+	lock_release(&sem->rw_sem.dep_map, 1, ip);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+	if (!read)
+		sem->rw_sem.owner = NULL;
+#endif
+}
+
+static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
+					bool read, unsigned long ip)
+{
+	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+}
+
 #endif
-- 
cgit 


From 853b39a7c82826b8413048feec7bf08e98ce7a84 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jul 2015 20:21:13 +0200
Subject: shift percpu_counter_destroy() into destroy_super_work()

Of course, this patch is ugly as hell. It will be (partially)
reverted later. We add it to ensure that other WIP changes in
percpu_rw_semaphore won't break fs/super.c.

We do not even need this change right now, percpu_free_rwsem()
is fine in atomic context. But we are going to change this, it
will be might_sleep() after we merge the rcu_sync() patches.

And even after that we do not really need destroy_super_work(),
we will kill it in any case. Instead, destroy_super_rcu() should
just check that rss->cb_state == CB_IDLE and do call_rcu() again
in the (very unlikely) case this is not true.

So this is just the temporary kludge which helps us to avoid the
conflicts with the changes which will be (hopefully) routed via
rcu tree.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
---
 include/linux/fs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index acb7cad84edd..4bed78966c6b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -30,6 +30,7 @@
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
+#include <linux/workqueue.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1375,7 +1376,7 @@ struct super_block {
 	struct list_lru		s_dentry_lru ____cacheline_aligned_in_smp;
 	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
 	struct rcu_head		rcu;
-
+	struct work_struct	destroy_work;
 	/*
 	 * Indicates how deep in a filesystem stack this SB is
 	 */
-- 
cgit 


From 8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 11 Aug 2015 17:05:04 +0200
Subject: change sb_writers to use percpu_rw_semaphore

We can remove everything from struct sb_writers except frozen
and add the array of percpu_rw_semaphore's instead.

This patch doesn't remove sb_writers->wait_unfrozen yet, we keep
it for get_super_thawed(). We will probably remove it later.

This change tries to address the following problems:

	- Firstly, __sb_start_write() looks simply buggy. It does
	  __sb_end_write() if it sees ->frozen, but if it migrates
	  to another CPU before percpu_counter_dec(), sb_wait_write()
	  can wrongly succeed if there is another task which holds
	  the same "semaphore": sb_wait_write() can miss the result
	  of the previous percpu_counter_inc() but see the result
	  of this percpu_counter_dec().

	- As Dave Hansen reports, it is suboptimal. The trivial
	  microbenchmark that writes to a tmpfs file in a loop runs
	  12% faster if we change this code to rely on RCU and kill
	  the memory barriers.

	- This code doesn't look simple. It would be better to rely
	  on the generic locking code.

	  According to Dave, this change adds the same performance
	  improvement.

Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times. This is just ugly. But:

	- This will be "fixed" by the rcu_sync changes we are going
	  to merge. After that freeze_super()->percpu_down_write()
	  will use synchronize_sched(), and thaw_super() won't use
	  synchronize() at all.

	  This doesn't need any changes in fs/super.c.

	- Once we merge rcu_sync changes, we can also change super.c
	  so that all wb_write->rw_sem's will share the single ->rss
	  in struct sb_writes, then freeze_super() will need only one
	  synchronize_sched().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
---
 include/linux/fs.h | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4bed78966c6b..ce356f66cc2a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_FS_H
 #define _LINUX_FS_H
 
-
 #include <linux/linkage.h>
 #include <linux/wait.h>
 #include <linux/kdev_t.h>
@@ -31,6 +30,7 @@
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
 #include <linux/workqueue.h>
+#include <linux/percpu-rwsem.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1275,16 +1275,9 @@ enum {
 #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
 
 struct sb_writers {
-	/* Counters for counting writers at each level */
-	struct percpu_counter	counter[SB_FREEZE_LEVELS];
-	wait_queue_head_t	wait;		/* queue for waiting for
-						   writers / faults to finish */
-	int			frozen;		/* Is sb frozen? */
-	wait_queue_head_t	wait_unfrozen;	/* queue for waiting for
-						   sb to be thawed */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	lock_map[SB_FREEZE_LEVELS];
-#endif
+	int				frozen;		/* Is sb frozen? */
+	wait_queue_head_t		wait_unfrozen;	/* for get_super_thawed() */
+	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
 struct super_block {
@@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level);
 int __sb_start_write(struct super_block *sb, int level, bool wait);
 
 #define __sb_writers_acquired(sb, lev)	\
-	rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
+	percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 #define __sb_writers_release(sb, lev)	\
-	rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
+	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 
 /**
  * sb_end_write - drop write access to a superblock
-- 
cgit 


From cbedaac63481dea52327127a9f1c60f092bd6b07 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 12 Mar 2015 08:19:11 -0400
Subject: inode: add hlist_fake to avoid the inode hash lock in evict

Some filesystems don't use the VFS inode hash and fake the fact they
are hashed so that all the writeback code works correctly. However,
this means the evict() path still tries to remove the inode from the
hash, meaning that the inode_hash_lock() needs to be taken
unnecessarily. Hence under certain workloads the inode_hash_lock can
be contended even if the inode is never actually hashed.

To avoid this add hlist_fake to test if the inode isn't actually
hashed to avoid taking the hash lock on inodes that have never been
hashed.  Based on Dave Chinner's

inode: add IOP_NOTHASHED to avoid inode hash lock in evict

basd on Al's suggestions.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 include/linux/fs.h   | 2 +-
 include/linux/list.h | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 84b783f277f7..4a40fa843040 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2608,7 +2608,7 @@ static inline void insert_inode_hash(struct inode *inode)
 extern void __remove_inode_hash(struct inode *);
 static inline void remove_inode_hash(struct inode *inode)
 {
-	if (!inode_unhashed(inode))
+	if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
 		__remove_inode_hash(inode);
 }
 
diff --git a/include/linux/list.h b/include/linux/list.h
index feb773c76ee0..3e3e64a61002 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -672,6 +672,11 @@ static inline void hlist_add_fake(struct hlist_node *n)
 	n->pprev = &n->next;
 }
 
+static inline bool hlist_fake(struct hlist_node *h)
+{
+	return h->pprev == &h->next;
+}
+
 /*
  * Move a list from one list head to another. Fixup the pprev
  * reference of the first entry if it exists.
-- 
cgit 


From 74278da9f70d84d715601fe794567a6d2bfdf078 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 12:37:22 -0500
Subject: inode: convert inode_sb_list_lock to per-sb

The process of reducing contention on per-superblock inode lists
starts with moving the locking to match the per-superblock inode
list. This takes the global lock out of the picture and reduces the
contention problems to within a single filesystem. This doesn't get
rid of contention as the locks still have global CPU scope, but it
does isolate operations on different superblocks form each other.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 include/linux/fs.h               | 5 ++++-
 include/linux/fsnotify_backend.h | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a40fa843040..09bbd38485f9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1309,7 +1309,6 @@ struct super_block {
 #endif
 	const struct xattr_handler **s_xattr;
 
-	struct list_head	s_inodes;	/* all inodes */
 	struct hlist_bl_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
@@ -1380,6 +1379,10 @@ struct super_block {
 	 * Indicates how deep in a filesystem stack this SB is
 	 */
 	int s_stack_depth;
+
+	/* s_inode_list_lock protects s_inodes */
+	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
+	struct list_head	s_inodes;	/* all inodes */
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 65a517dd32f7..0390ee69c439 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -357,7 +357,7 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
-extern void fsnotify_unmount_inodes(struct list_head *list);
+extern void fsnotify_unmount_inodes(struct super_block *sb);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern void fsnotify_init_event(struct fsnotify_event *event,
@@ -393,7 +393,7 @@ static inline u32 fsnotify_get_cookie(void)
 	return 0;
 }
 
-static inline void fsnotify_unmount_inodes(struct list_head *list)
+static inline void fsnotify_unmount_inodes(struct super_block *sb)
 {}
 
 #endif	/* CONFIG_FSNOTIFY */
-- 
cgit 


From e97fedb9ef9868ff24d588be781906cf7c1b59ae Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 13:40:00 -0500
Subject: sync: serialise per-superblock sync operations

When competing sync(2) calls walk the same filesystem, they need to
walk the list of inodes on the superblock to find all the inodes
that we need to wait for IO completion on. However, when multiple
wait_sb_inodes() calls do this at the same time, they contend on the
the inode_sb_list_lock and the contention causes system wide
slowdowns. In effect, concurrent sync(2) calls can take longer and
burn more CPU than if they were serialised.

Stop the worst of the contention by adding a per-sb mutex to wrap
around wait_sb_inodes() so that we only execute one sync(2) IO
completion walk per superblock superblock at a time and hence avoid
contention being triggered by concurrent sync(2) calls.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 09bbd38485f9..82dfc5519b4b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1375,6 +1375,8 @@ struct super_block {
 	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
 	struct rcu_head		rcu;
 
+	struct mutex		s_sync_lock;	/* sync serialisation lock */
+
 	/*
 	 * Indicates how deep in a filesystem stack this SB is
 	 */
-- 
cgit 


From c7f5408493aeb01532927b2276316797a03ed6ee Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 14:07:22 -0500
Subject: inode: rename i_wb_list to i_io_list

There's a small consistency problem between the inode and writeback
naming. Writeback calls the "for IO" inode queues b_io and
b_more_io, but the inode calls these the "writeback list" or
i_wb_list. This makes it hard to an new "under writeback" list to
the inode, or call it an "under IO" list on the bdi because either
way we'll have writeback on IO and IO on writeback and it'll just be
confusing. I'm getting confused just writing this!

So, rename the inode "for IO" list variable to i_io_list so we can
add a new "writeback list" in a subsequent patch.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 82dfc5519b4b..34cfa60db678 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -636,7 +636,7 @@ struct inode {
 	unsigned long		dirtied_time_when;
 
 	struct hlist_node	i_hash;
-	struct list_head	i_wb_list;	/* backing dev IO list */
+	struct list_head	i_io_list;	/* backing dev IO list */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
 
-- 
cgit