From 13db16d74c02c0e9b2d0dc9b9773c8e58ad41064 Mon Sep 17 00:00:00 2001
From: Yufen Yu
Date: Mon, 23 Apr 2018 17:37:30 +0800
Subject: md: fix a code format error and remove unused bio_sector

Signed-off-by: Yufen Yu
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c  | 2 --
 drivers/md/raid10.c | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e9e3308cb0a7..397b3d80c48b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2450,7 +2450,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	struct mddev *mddev = conf->mddev;
 	struct bio *bio;
 	struct md_rdev *rdev;
-	sector_t bio_sector;
 
 	clear_bit(R1BIO_ReadError, &r1_bio->state);
 	/* we got a read error. Maybe the drive is bad.  Maybe just
@@ -2463,7 +2462,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 */
 
 	bio = r1_bio->bios[r1_bio->read_disk];
-	bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
 	bio_put(bio);
 	r1_bio->bios[r1_bio->read_disk] = NULL;
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c60774c8430..64300542ab20 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2363,7 +2363,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 {
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
-	struct md_rdev*rdev;
+	struct md_rdev *rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-- cgit

From eb81b328267b2d97d11441483f5ac9dccb505818 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Thu, 26 Apr 2018 10:56:37 +0800
Subject: raid10: check bio in r10buf_pool_free to avoid NULL pointer dereference

For the recovery case, r10buf_pool_alloc only allocates 2 bios, so we
can't access more than 2 bios in r10buf_pool_free. Otherwise, we can
see a NULL pointer dereference as follows:

[   98.347009] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050
[   98.355783] IP: r10buf_pool_free+0x38/0xe0 [raid10]
[...]
[   98.543734] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   98.550161] CR2: 0000000000000050 CR3: 000000089500a001 CR4: 00000000001606f0
[   98.558145] Call Trace:
[   98.560881]  <IRQ>
[   98.563136]  put_buf+0x19/0x20 [raid10]
[   98.567426]  end_sync_request+0x6b/0x70 [raid10]
[   98.572591]  end_sync_write+0x9b/0x160 [raid10]
[   98.577662]  blk_update_request+0x78/0x2c0
[   98.582254]  scsi_end_request+0x2c/0x1e0 [scsi_mod]
[   98.587719]  scsi_io_completion+0x22f/0x610 [scsi_mod]
[   98.593472]  blk_done_softirq+0x8e/0xc0
[   98.597767]  __do_softirq+0xde/0x2b3
[   98.601770]  irq_exit+0xae/0xb0
[   98.605285]  do_IRQ+0x81/0xd0
[   98.608606]  common_interrupt+0x7d/0x7d
[   98.612898]  </IRQ>

So we need to check whether the bio is valid before it is used in
r10buf_pool_free. Another workable way would be to free only 2 bios for
the recovery case, just like r10buf_pool_alloc does.
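For illustration, here is a minimal userspace sketch of the guard
pattern this fix adopts: every slot is checked before its resources are
released, because the recovery path populates fewer slots than
conf->copies. All names here (fake_bio, free_bios) are hypothetical
stand-ins, not the kernel structures:

```c
#include <stdio.h>
#include <stdlib.h>

#define COPIES 4

struct fake_bio { int id; };

/* Models r10buf_pool_free after the fix: a recovery r10_bio has fewer
 * bios allocated than there are copies, so check each slot first. */
static void free_bios(struct fake_bio **bios, int copies)
{
	for (int j = copies; j--; ) {
		struct fake_bio *bio = bios[j];

		if (bio) {		/* guard: slot may be unused */
			printf("freeing bio %d\n", bio->id);
			free(bio);
			bios[j] = NULL;
		}
	}
}

int main(void)
{
	struct fake_bio *bios[COPIES] = { NULL };

	/* recovery case: only the first two slots are populated */
	for (int j = 0; j < 2; j++) {
		bios[j] = malloc(sizeof(*bios[j]));
		bios[j]->id = j;
	}
	free_bios(bios, COPIES);	/* no NULL dereference */
	return 0;
}
```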
Fixes: f0250618361d ("md: raid10: don't use bio's vec table to manage resync pages")
Reported-by: Alexis Castilla
Tested-by: Alexis Castilla
Signed-off-by: Guoqing Jiang
Signed-off-by: Shaohua Li
---
 drivers/md/raid10.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64300542ab20..976ef8cae64c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -255,9 +255,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 	for (j = conf->copies; j--; ) {
 		struct bio *bio = r10bio->devs[j].bio;
 
-		rp = get_resync_pages(bio);
-		resync_free_pages(rp);
-		bio_put(bio);
+		if (bio) {
+			rp = get_resync_pages(bio);
+			resync_free_pages(rp);
+			bio_put(bio);
+		}
 
 		bio = r10bio->devs[j].repl_bio;
 		if (bio)
-- cgit

From 011abdc9df559ec75779bb7c53a744c69b2a94c6 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 26 Apr 2018 14:46:29 +1000
Subject: md: fix two problems with setting the "re-add" device state.

If "re-add" is written to the "state" file for a device which is
faulty, this has an effect similar to removing and re-adding the
device. It should take up the same slot in the array that it
previously had, and an accelerated (e.g. bitmap-based) rebuild should
happen.

The slot that "it previously had" is determined by
rdev->saved_raid_disk. However this is not set when a device fails
(only when a device is added), and it is cleared when resync
completes. This means that "re-add" will normally work once, but may
not work a second time.

This patch includes two fixes.
1/ when a device fails, record the ->raid_disk value in
   ->saved_raid_disk before clearing ->raid_disk
2/ when "re-add" is written to a device for which ->saved_raid_disk
   is not set, fail.

I think this is suitable for stable as it can cause re-adding a device
to be forced to do a full resync, which takes a lot longer and so puts
data at more risk.

Cc: (v4.1)
Fixes: 97f6cd39da22 ("md-cluster: re-add capabilities")
Signed-off-by: NeilBrown
Reviewed-by: Goldwyn Rodrigues
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c208c01f63a5..bac480d75d1d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2853,7 +2853,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			err = 0;
 		}
 	} else if (cmd_match(buf, "re-add")) {
-		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+			rdev->saved_raid_disk >= 0) {
 			/* clear_bit is performed _after_ all the devices
 			 * have their local Faulty bit cleared. If any writes
 			 * happen in the meantime in the local node, they
@@ -8641,6 +8642,7 @@ static int remove_and_add_spares(struct mddev *mddev,
 			if (mddev->pers->hot_remove_disk(
 				    mddev, rdev) == 0) {
 				sysfs_unlink_rdev(mddev, rdev);
+				rdev->saved_raid_disk = rdev->raid_disk;
 				rdev->raid_disk = -1;
 				removed++;
 			}
-- cgit

From 2cd259a77de561d49c1ff939a239095749e65ee7 Mon Sep 17 00:00:00 2001
From: Mariusz Dabrowski
Date: Thu, 19 Apr 2018 19:28:10 +0200
Subject: raid5: copy write hint from origin bio to stripe

Store the write hint from the original bio in the stripe head so it
can be assigned to the bio sent to each RAID device.
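The idea is a stash-and-forward: the incoming bio's hint is parked in
the per-device stripe slot and later copied onto each outgoing bio,
then reset so a stale hint is never reused. A hedged userspace sketch
of just that pattern — all types and names here are hypothetical, not
the kernel's bio/stripe_head structures:

```c
#include <stdio.h>

enum { WRITE_LIFE_NOT_SET = 0, WRITE_LIFE_SHORT = 2 };

struct fake_bio { unsigned short write_hint; };
struct fake_dev { unsigned short write_hint; };

/* models add_stripe_bio(): park the caller's hint in the stripe slot */
static void stash_hint(struct fake_dev *dev, const struct fake_bio *in)
{
	dev->write_hint = in->write_hint;
}

/* models ops_run_io(): forward the parked hint to the outgoing bio,
 * then reset the slot so a stale hint is never reused */
static void forward_hint(struct fake_dev *dev, struct fake_bio *out)
{
	out->write_hint = dev->write_hint;
	dev->write_hint = WRITE_LIFE_NOT_SET;
}

int main(void)
{
	struct fake_bio in = { .write_hint = WRITE_LIFE_SHORT }, out = { 0 };
	struct fake_dev dev = { .write_hint = WRITE_LIFE_NOT_SET };

	stash_hint(&dev, &in);
	forward_hint(&dev, &out);
	printf("outgoing hint: %hu\n", out.write_hint);	/* prints 2 */
	return 0;
}
```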
Signed-off-by: Mariusz Dabrowski
Reviewed-by: Artur Paszkiewicz
Reviewed-by: Pawel Baldysiak
Signed-off-by: Shaohua Li
---
 drivers/md/raid5.c | 6 ++++++
 drivers/md/raid5.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index be117d0a65a8..1e1515c817ff 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,6 +1139,9 @@ again:
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_iter.bi_size = STRIPE_SIZE;
+			bi->bi_write_hint = sh->dev[i].write_hint;
+			if (!rrdev)
+				sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
 			/*
 			 * If this is discard request, set bi_vcnt 0. We don't
 			 * want to confuse SCSI because SCSI will replace payload
@@ -1190,6 +1193,8 @@ again:
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
 			rbi->bi_iter.bi_size = STRIPE_SIZE;
+			rbi->bi_write_hint = sh->dev[i].write_hint;
+			sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
 			/*
 			 * If this is discard request, set bi_vcnt 0. We don't
 			 * want to confuse SCSI because SCSI will replace payload
@@ -3204,6 +3209,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)sh->sector);
 
 	spin_lock_irq(&sh->stripe_lock);
+	sh->dev[dd_idx].write_hint = bi->bi_write_hint;
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
 		goto overlap;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3f8da26032ac..aea2447b0ea7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -257,6 +257,7 @@ struct stripe_head {
 		sector_t	sector;		/* sector of this page */
 		unsigned long	flags;
 		u32		log_checksum;
+		unsigned short	write_hint;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };
 
-- cgit

From c42a0e2675721e1444f56e6132a07b7b1ec169ac Mon Sep 17 00:00:00 2001
From: Yufen Yu
Date: Fri, 4 May 2018 18:08:10 +0800
Subject: md: fix NULL dereference of mddev->pers in remove_and_add_spares()

We met a NULL pointer BUG as follows:

[  151.760358] BUG: unable to handle kernel NULL pointer dereference at 0000000000000060
[  151.761340] PGD 80000001011eb067 P4D 80000001011eb067 PUD 1011ea067 PMD 0
[  151.762039] Oops: 0000 [#1] SMP PTI
[  151.762406] Modules linked in:
[  151.762723] CPU: 2 PID: 3561 Comm: mdadm-test Kdump: loaded Not tainted 4.17.0-rc1+ #238
[  151.763542] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 04/01/2014
[  151.764432] RIP: 0010:remove_and_add_spares.part.56+0x13c/0x3a0
[  151.765061] RSP: 0018:ffffc90001d7fcd8 EFLAGS: 00010246
[  151.765590] RAX: 0000000000000000 RBX: ffff88013601d600 RCX: 0000000000000000
[  151.766306] RDX: 0000000000000000 RSI: ffff88013601d600 RDI: ffff880136187000
[  151.767014] RBP: ffff880136187018 R08: 0000000000000003 R09: 0000000000000051
[  151.767728] R10: ffffc90001d7fed8 R11: 0000000000000000 R12: ffff88013601d600
[  151.768447] R13: ffff8801298b1300 R14: ffff880136187000 R15: 0000000000000000
[  151.769160] FS:  00007f2624276700(0000) GS:ffff88013ae80000(0000) knlGS:0000000000000000
[  151.769971] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  151.770554] CR2: 0000000000000060 CR3: 0000000111aac000 CR4: 00000000000006e0
[  151.771272] Call Trace:
[  151.771542]  md_ioctl+0x1df2/0x1e10
[  151.771906]  ? __switch_to+0x129/0x440
[  151.772295]  ? __schedule+0x244/0x850
[  151.772672]  blkdev_ioctl+0x4bd/0x970
[  151.773048]  block_ioctl+0x39/0x40
[  151.773402]  do_vfs_ioctl+0xa4/0x610
[  151.773770]  ? dput.part.23+0x87/0x100
[  151.774151]  ksys_ioctl+0x70/0x80
[  151.774493]  __x64_sys_ioctl+0x16/0x20
[  151.774877]  do_syscall_64+0x5b/0x180
[  151.775258]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

For raid6, when two disks of the array are offline, two spare disks
can be added into the array. Before the spare disks complete recovery,
the system reboots and mdadm thinks it is OK to restart the degraded
array via md_ioctl(). Since the disks in raid6 are not only_parity(),
raid5_run() will abort when there is no PPL feature and the
'start_dirty_degraded' parameter is not set. Therefore, mddev->pers is
NULL.

But mddev->raid_disks has been set, and it is not cleared when
raid5_run() aborts. md_ioctl() can then execute cmd 'HOT_REMOVE_DISK'
to remove a disk via mdadm, which finally causes a NULL pointer
dereference in remove_and_add_spares().

Signed-off-by: Yufen Yu
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bac480d75d1d..df9eb1a04f26 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6525,6 +6525,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
 
+	if (!mddev->pers)
+		return -ENODEV;
+
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
-- cgit

From b33d10624fdc15cdf1495f3f00481afccec76783 Mon Sep 17 00:00:00 2001
From: Gioh Kim
Date: Wed, 2 May 2018 13:08:11 +0200
Subject: md/raid1: add error handling of read error from FailFast device

The current handle_read_error() function calls fix_read_error() only
if the md device is RW and the rdev does not have the FailFast flag
set. It does not handle a read error from a RW device that does have
the FailFast flag. I am not sure whether that is intended, but I found
that a write IO error sets the rdev faulty. The md module should
handle read IO errors and write IO errors equally, so I think a read
IO error should also set the rdev faulty.

Signed-off-by: Gioh Kim
Reviewed-by: Jack Wang
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 397b3d80c48b..28ad97eeb6ab 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2472,6 +2472,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
 		unfreeze_array(conf);
+	} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
+		md_error(mddev, rdev);
 	} else {
 		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
 	}
-- cgit

From 448ec638c6bcf36923db695b6188373b933e7ddc Mon Sep 17 00:00:00 2001
From: Amy Chiang
Date: Wed, 16 May 2018 18:59:35 +0800
Subject: md/raid5: Assigning NULL to sh->batch_head before testing bit R5_Overlap of a stripe

In add_stripe_bio(), if the stripe_head is in a batch list, the
incoming bio is regarded as overlapping, and the bit R5_Overlap on
this stripe_head is set. break_stripe_batch_list() checks bit
R5_Overlap on each stripe_head first and then assigns NULL to
sh->batch_head.

If break_stripe_batch_list() checks bit R5_Overlap on stripe_head A
after add_stripe_bio() finds stripe_head A is in the batch list, but
before add_stripe_bio() sets bit R5_Overlap on stripe_head A, then
break_stripe_batch_list() does not know there is a process sleeping in
wait_for_overlap that needs a wake_up() call. There's a huge chance
such a process never returns from schedule() if add_stripe_bio() is
called from raid5_make_request().
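To make the lost-wakeup window concrete, here is a simplified pthread
model of the fixed ordering. It is illustrative only: the mutex stands
in for sh->stripe_lock and the two booleans for batch_head and
R5_Overlap, whereas the kernel's checker side is lockless; the point is
only that publishing "not in a batch" before the overlap test closes
the window:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t stripe_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_for_overlap = PTHREAD_COND_INITIALIZER;
static bool in_batch = true;	/* models sh->batch_head != NULL */
static bool overlap;		/* models the R5_Overlap bit */

/* models add_stripe_bio(): if the stripe is batched, flag overlap and
 * sleep; re-check in_batch on every wakeup */
static void *writer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&stripe_lock);
	while (in_batch) {
		overlap = true;
		pthread_cond_wait(&wait_for_overlap, &stripe_lock);
	}
	pthread_mutex_unlock(&stripe_lock);
	puts("writer proceeds");
	return NULL;
}

/* models break_stripe_batch_list() after the fix: clear in_batch FIRST,
 * then check for waiters. With the old order (test overlap, then clear
 * in_batch) a writer that set 'overlap' in between was never woken. */
static void break_batch(void)
{
	pthread_mutex_lock(&stripe_lock);
	in_batch = false;		/* publish first (the fix) */
	if (overlap) {
		overlap = false;
		pthread_cond_broadcast(&wait_for_overlap);
	}
	pthread_mutex_unlock(&stripe_lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	break_batch();
	pthread_join(t, NULL);	/* never hangs with the fixed order */
	return 0;
}
```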
In break_stripe_batch_list(), assigning NULL to sh->batch_head should
therefore be done before it checks bit R5_Overlap of a stripe_head.

Signed-off-by: Amy Chiang
Signed-off-by: Shaohua Li
---
 drivers/md/raid5.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1e1515c817ff..d034a20a12be 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4620,15 +4620,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
+		spin_lock_irq(&sh->stripe_lock);
+		sh->batch_head = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
 		for (i = 0; i < sh->disks; i++) {
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 				do_wakeup = 1;
 			sh->dev[i].flags = head_sh->dev[i].flags &
 				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
 		}
-		spin_lock_irq(&sh->stripe_lock);
-		sh->batch_head = NULL;
-		spin_unlock_irq(&sh->stripe_lock);
 		if (handle_flags == 0 ||
 		    sh->state & handle_flags)
 			set_bit(STRIPE_HANDLE, &sh->state);
-- cgit

From 5a409b4f56d50b212334f338cb8465d65550cd85 Mon Sep 17 00:00:00 2001
From: Xiao Ni
Date: Mon, 21 May 2018 11:49:54 +0800
Subject: MD: fix lock contention for flush bios

There is lock contention when many processes send flush bios to an md
device, e.g. when creating many LVs on one raid device and running
mkfs.xfs on each LV. Currently flush requests can only be handled
sequentially: a new flush needs to wait for mddev->flush_bio to become
NULL, otherwise it waits on mddev->lock. This patch removes
mddev->flush_bio and handles flush bios asynchronously.

I did a test with the command dbench -s 128 -t 300. This is the test
result:

=================Without the patch============================
 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                    11165   167.595  5879.560
 Close                   107469     1.391  2231.094
 LockX                      384     0.003     0.019
 Rename                    5944     2.141  1856.001
 ReadX                   208121     0.003     0.074
 WriteX                   98259  1925.402 15204.895
 Unlink                   25198    13.264  3457.268
 UnlockX                    384     0.001     0.009
 FIND_FIRST               47111     0.012     0.076
 SET_FILE_INFORMATION     12966     0.007     0.065
 QUERY_FILE_INFORMATION   27921     0.004     0.085
 QUERY_PATH_INFORMATION  124650     0.005     5.766
 QUERY_FS_INFORMATION     22519     0.003     0.053
 NTCreateX               141086     4.291  2502.812

Throughput 3.7181 MB/sec (sync open)  128 clients  128 procs  max_latency=15204.905 ms

=================With the patch============================
 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                     4500   174.134   406.398
 Close                    48195     0.060   467.062
 LockX                      256     0.003     0.029
 Rename                    2324     0.026     0.360
 ReadX                    78846     0.004     0.504
 WriteX                   66832   562.775  1467.037
 Unlink                    5516     3.665  1141.740
 UnlockX                    256     0.002     0.019
 FIND_FIRST               16428     0.015     0.313
 SET_FILE_INFORMATION      6400     0.009     0.520
 QUERY_FILE_INFORMATION   17865     0.003     0.089
 QUERY_PATH_INFORMATION   47060     0.078   416.299
 QUERY_FS_INFORMATION      7024     0.004     0.032
 NTCreateX                55921     0.854  1141.452

Throughput 11.744 MB/sec (sync open)  128 clients  128 procs  max_latency=1467.041 ms

The test was done on a raid1 device with two rotational disks.

V5: V4 is more complicated than the version with the memory pool, so
revert to the memory pool version.
V4: use the address of fbio to hash and choose a free flush_info.
V3: Shaohua suggests mempool is overkill. In v3 the memory is allocated
when the raid device is created, and a simple bitmap records which
resource is free. Also fix a bug from v2: flush_pending should be set
to 1 at first.
V2: Neil pointed out two problems: a counting error, and the wrong
return value when memory allocation fails.
1. Counting error problem
This isn't safe: it is only safe to call rdev_dec_pending() on rdevs
for which you previously called atomic_inc(&rdev->nr_pending). If an
rdev was added to the list between the start and end of the flush,
this will do something bad. The patch no longer uses bio_chain; it
uses a dedicated callback function for each flush bio.
2. Returning an IO error when kmalloc fails is wrong. I use the
mempool suggested by Neil in V2.
3. Fixed some places pointed out by Guoqing.

Suggested-by: Ming Lei
Signed-off-by: Xiao Ni
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 158 +++++++++++++++++++++++++++++++++++++-------------------
 drivers/md/md.h |  22 +++++---
 2 files changed, 120 insertions(+), 60 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index df9eb1a04f26..6b4e2f29fe4e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
 		mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
+static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+	kfree(flush_info);
+}
+
+static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+	kfree(flush_bio);
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
+static void submit_flushes(struct work_struct *ws)
+{
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *bio = fi->bio;
+
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, bio);
+
+	mempool_free(fi, mddev->flush_pool);
+}
 
-static void md_end_flush(struct bio *bio)
+static void md_end_flush(struct bio *fbio)
 {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_bio *fb = fbio->bi_private;
+	struct md_rdev *rdev = fb->rdev;
+	struct flush_info *fi = fb->fi;
+	struct bio *bio = fi->bio;
+	struct mddev *mddev = fi->mddev;
 
 	rdev_dec_pending(rdev, mddev);
 
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
-	bio_put(bio);
-}
 
-static void md_submit_flush_data(struct work_struct *ws);
+	mempool_free(fb, mddev->flush_bio_pool);
+	bio_put(fbio);
+}
 
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct flush_info *fi;
+
+	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+	fi->bio = bio;
+	fi->mddev = mddev;
+	atomic_set(&fi->flush_pending, 1);
 
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -445,59 +486,39 @@ static void submit_flushes(struct work_struct *ws)
 			 * we reclaim rcu_read_lock
 			 */
 			struct bio *bi;
+			struct flush_bio *fb;
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
+
+			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+			fb->fi = fi;
+			fb->rdev = rdev;
+
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
 			bio_set_dev(bi, rdev->bdev);
+			bi->bi_end_io = md_end_flush;
+			bi->bi_private = fb;
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+
+			atomic_inc(&fi->flush_pending);
 			submit_bio(bi);
+
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
-
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
-
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -555,7 +576,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
@@ -5510,6 +5530,22 @@ int md_run(struct mddev *mddev)
 			goto abort;
 		}
 	}
+	if (mddev->flush_pool == NULL) {
+		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+						flush_info_free, mddev);
+		if (!mddev->flush_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
+	if (mddev->flush_bio_pool == NULL) {
+		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+						flush_bio_free, mddev);
+		if (!mddev->flush_bio_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5669,6 +5705,14 @@ int md_run(struct mddev *mddev)
 	return 0;
 
 abort:
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool){
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
@@ -5889,6 +5933,14 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925cce810..ffcb1ae217fe 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,19 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+	struct bio		*bio;
+	struct mddev		*mddev;
+	struct work_struct	flush_work;
+	atomic_t		flush_pending;
+};
+struct flush_bio {
+	struct flush_info *fi;
+	struct md_rdev *rdev;
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -457,13 +470,8 @@ struct mddev {
 					 * metadata and bitmap writes
 					 */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
+	mempool_t			*flush_pool;
+	mempool_t			*flush_bio_pool;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
-- cgit
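The per-flush accounting in this last patch follows a common
counted-completion pattern: flush_pending starts at 1 (a
self-reference held by md_flush_request), each sub-flush adds 1, and
whoever drops the count to zero finishes the parent bio. A hedged
userspace sketch of just that pattern — names are hypothetical, C11
stdatomic stands in for the kernel's atomic_t, and the mempool
lifecycle is reduced to malloc/free:

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* models struct flush_info: one per in-flight flush */
struct flush_info {
	atomic_int pending;
	void (*done)(struct flush_info *fi);
};

static void flush_done(struct flush_info *fi)
{
	puts("flush complete; handle the data part of the bio");
	free(fi);
}

/* each sub-request takes a reference before it is issued ... */
static void get_ref(struct flush_info *fi)
{
	atomic_fetch_add(&fi->pending, 1);
}

/* ... and whoever drops the last reference completes the flush.
 * This mirrors md_end_flush() racing with md_flush_request(): the
 * initial self-reference keeps fi alive until all sub-flushes have
 * been issued, regardless of completion order. */
static void put_ref(struct flush_info *fi)
{
	if (atomic_fetch_sub(&fi->pending, 1) == 1)
		fi->done(fi);
}

int main(void)
{
	struct flush_info *fi = malloc(sizeof(*fi));

	atomic_init(&fi->pending, 1);	/* self-reference */
	fi->done = flush_done;

	for (int d = 0; d < 3; d++) {	/* one sub-flush per rdev */
		get_ref(fi);
		put_ref(fi);		/* completion may run immediately */
	}
	put_ref(fi);			/* drop the self-reference */
	return 0;
}
```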