From f4370781d83cd2e52eb515e4663155e8091e4d4e Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:14 -0700
Subject: [PATCH] md: possible fix for unplug problem

I have reports of a problem with raid5 which turns out to be because the
raid5 device gets stuck in a 'plugged' state.  This shouldn't be able to
happen, as 3msec after the device is plugged it should be unplugged.
However it happens nonetheless.

This patch fixes the problem and is a reasonable thing to do, though it
might hurt performance slightly in some cases.

Until I can find the real problem, we should probably have this
workaround in place.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7433871f4b3a..5764387c6989 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -270,7 +270,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                              < (conf->max_nr_stripes *3/4)
                              || !conf->inactive_blocked),
                             conf->device_lock,
-                            unplug_slaves(conf->mddev)
+                            raid5_unplug_device(conf->mddev->queue)
                         );
                 conf->inactive_blocked = 0;
         } else
-- cgit

From 0b8c9de05c2a860fe6b02fedcb48763bcee648b3 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:15 -0700
Subject: [PATCH] md: delay starting md threads until array is completely setup

When an array is started, we start one or two threads (two if there is a
reshape or recovery that needs to be completed).

We currently start these *before* the array is completely set up, and in
particular before queue->queuedata is set.  If the thread actually starts
very quickly on another CPU, we can end up dereferencing queue->queuedata
before it has been set, and oops.

This patch also makes sure we don't try to start a recovery if a reshape
is being restarted.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5764387c6989..a02f35f1a796 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3246,9 +3246,6 @@ static int run(mddev_t *mddev)
                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                                                         "%s_reshape");
-                /* FIXME if md_register_thread fails?? */
-                md_wakeup_thread(mddev->sync_thread);
-
         }
 
         /* read-ahead size must cover two whole stripes, which is
-- cgit
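The ordering bug fixed in the patch above is easier to see outside the kernel.
The following stand-alone sketch is illustrative only - the struct, the field
value and the pthread framing are invented for the example, not taken from the
md code - but it shows the same shape of race: a worker thread must not be
started until the data it will dereference has been initialised.

/* Illustration only, not kernel code.  Build with: gcc -pthread race.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_queue {
        void *queuedata;                /* set by the "array setup" code */
};

static struct fake_queue queue;         /* static, so queuedata starts NULL */

static void *worker(void *arg)
{
        struct fake_queue *q = arg;

        /* With the buggy ordering the worker could run before queuedata is
         * assigned, see NULL here, and dereferencing it would be the oops. */
        if (q->queuedata == NULL)
                printf("worker ran too early: queuedata still NULL\n");
        else
                printf("worker saw queuedata = %s\n", (char *)q->queuedata);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        /* Fixed ordering: finish all setup first, then start the thread. */
        queue.queuedata = "mddev";
        if (pthread_create(&tid, NULL, worker, &queue) != 0) {
                perror("pthread_create");
                return EXIT_FAILURE;
        }
        pthread_join(tid, NULL);
        return EXIT_SUCCESS;
}

With the assignment done before pthread_create() the worker can never observe
the unset field, which is the effect the patch is after: the sync thread must
not run until the array, including queue->queuedata, is fully set up.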
From ff4e8d9a9f46e3a7f89d14ade52fe5d53a82c022 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:16 -0700
Subject: [PATCH] md: fix resync speed calculation for restarted resyncs

We introduced 'io_sectors' recently so we could count the sectors that cause
IO during resync separately from sectors which don't cause IO - there can be
a difference if a bitmap is being used to accelerate resync.

However when a speed is reported, we find the number of sectors processed
recently by subtracting an oldish io_sectors count from a current
'curr_resync' count.  This is wrong because curr_resync counts all sectors,
not just io sectors.

So, add a field to mddev to store the current io_sectors separately from
curr_resync, and use that in the calculations.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a02f35f1a796..dd0d00108a31 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -281,7 +281,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
         } else {
                 if (!test_bit(STRIPE_HANDLE, &sh->state))
                         atomic_inc(&conf->active_stripes);
-                if (list_empty(&sh->lru))
+                if (list_empty(&sh->lru) &&
+                    !test_bit(STRIPE_EXPANDING, &sh->state))
                         BUG();
                 list_del_init(&sh->lru);
         }
-- cgit

From 7c785b7a18dc30572a49c6b75efd384269735d14 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:16 -0700
Subject: [PATCH] md: fix a plug/unplug race in raid5

When a device is unplugged, requests are moved from one or two (depending on
whether a bitmap is in use) queues to the main request queue.

So whenever requests are put on either of those queues, we should make sure
the raid5 array is 'plugged'.  However we don't.  We currently plug the raid5
queue just before putting requests on those queues, so there is room for a
race.  If something unplugs the queue at just the wrong time, requests will
be left on the queue and nothing will want to unplug them.  Normally
something else will plug and unplug the queue fairly soon, but there is a
risk that nothing will.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dd0d00108a31..6ba394082129 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -88,12 +88,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                 BUG_ON(!list_empty(&sh->lru));
                 BUG_ON(atomic_read(&conf->active_stripes)==0);
                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                        if (test_bit(STRIPE_DELAYED, &sh->state))
+                        if (test_bit(STRIPE_DELAYED, &sh->state)) {
                                 list_add_tail(&sh->lru, &conf->delayed_list);
-                        else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                 conf->seq_write == sh->bm_seq)
+                                blk_plug_device(conf->mddev->queue);
+                        } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                   conf->seq_write == sh->bm_seq) {
                                 list_add_tail(&sh->lru, &conf->bitmap_list);
-                        else {
+                                blk_plug_device(conf->mddev->queue);
+                        } else {
                                 clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                 list_add_tail(&sh->lru, &conf->handle_list);
                         }
@@ -2555,13 +2557,6 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
         return ret;
 }
 
-static inline void raid5_plug_device(raid5_conf_t *conf)
-{
-        spin_lock_irq(&conf->device_lock);
-        blk_plug_device(conf->mddev->queue);
-        spin_unlock_irq(&conf->device_lock);
-}
-
 static int make_request(request_queue_t *q, struct bio * bi)
 {
         mddev_t *mddev = q->queuedata;
@@ -2671,7 +2666,6 @@ static int make_request(request_queue_t *q, struct bio * bi)
                                 goto retry;
                         }
                         finish_wait(&conf->wait_for_overlap, &w);
-                        raid5_plug_device(conf);
                         handle_stripe(sh, NULL);
                         release_stripe(sh);
                 } else {
-- cgit
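The fix above calls blk_plug_device() at the exact point where a stripe is
queued on the delayed or bitmap list, so queueing and plugging happen in the
same critical section.  The following user-space sketch is illustrative only:
the mutex, the counters and the function names are invented stand-ins for
conf->device_lock, the delayed/bitmap lists and the block-layer plug state,
not the real interfaces.

/* Illustration only, not kernel code.  Build with: gcc -pthread plug.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int queued;      /* requests sitting on a "delayed" list */
static int plugged;     /* whether a later unplug is guaranteed to run */

/* Fixed shape: queue the request and plug in one critical section, so no
 * unplug can ever observe "queued but not plugged". */
static void queue_delayed_request(void)
{
        pthread_mutex_lock(&lock);
        queued++;
        plugged = 1;
        pthread_mutex_unlock(&lock);
}

/* Unplug: release everything that was queued while plugged. */
static void unplug(void)
{
        pthread_mutex_lock(&lock);
        if (plugged) {
                printf("unplug: releasing %d delayed request(s)\n", queued);
                queued = 0;
                plugged = 0;
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        queue_delayed_request();
        /* If queueing and plugging were separate critical sections, an
         * unplug interleaved between the two steps could run with
         * plugged == 0, release nothing, and leave the queued request
         * waiting for an unplug that may never come - the race described
         * in the commit message above. */
        unplug();
        return 0;
}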
From ae3c20ccf84c88d45616f12122f781a900118f09 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:17 -0700
Subject: [PATCH] md: fix some small races in bitmap plugging in raid5

The comment gives more details, but I didn't quite have the sequencing
right, so there was room for races to leave bits unset in the on-disk
bitmap for short periods of time.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6ba394082129..56303ff31730 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -18,6 +18,30 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches.  Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->bm_write is the number of the last batch successfully written.
+ * conf->bm_flush is the number of the last batch that was closed to
+ *    new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is bm_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ *    we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment bm_flush, thus closing the current
+ *    batch.
+ * When we notice that bm_flush > bm_write, we write out all pending updates
+ * to the bitmap, and advance bm_write to where bm_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ *    miss any bits.
+ */
 #include 
 #include 
@@ -92,7 +116,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                                 list_add_tail(&sh->lru, &conf->delayed_list);
                                 blk_plug_device(conf->mddev->queue);
                         } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                   conf->seq_write == sh->bm_seq) {
+                                   sh->bm_seq - conf->seq_write > 0) {
                                 list_add_tail(&sh->lru, &conf->bitmap_list);
                                 blk_plug_device(conf->mddev->queue);
                         } else {
@@ -1273,9 +1297,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                 (unsigned long long)sh->sector, dd_idx);
 
         if (conf->mddev->bitmap && firstwrite) {
-                sh->bm_seq = conf->seq_write;
                 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                                   STRIPE_SECTORS, 0);
+                sh->bm_seq = conf->seq_flush+1;
                 set_bit(STRIPE_BIT_DELAY, &sh->state);
         }
 
@@ -2918,7 +2942,7 @@ static void raid5d (mddev_t *mddev)
         while (1) {
                 struct list_head *first;
 
-                if (conf->seq_flush - conf->seq_write > 0) {
+                if (conf->seq_flush != conf->seq_write) {
                         int seq = conf->seq_flush;
                         spin_unlock_irq(&conf->device_lock);
                         bitmap_unplug(mddev->bitmap);
-- cgit
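A key change above is the comparison in __release_stripe(): instead of asking
whether the stripe's batch number equals the last written batch, the stripe is
now delayed while its batch is still ahead of what has been written
(sh->bm_seq - conf->seq_write > 0).  A difference test of that shape stays
correct even if the stripe's batch is more than one ahead or the counters
wrap.  The sketch below is illustrative only - the function name, the integer
widths and the sample values are chosen for the example, not taken from the
raid5 code.

/* Illustration only, not kernel code.  Build with: gcc seq.c */
#include <stdio.h>

/* "batch bm_seq has not yet been written out" - a signed interpretation of
 * the difference, assuming two's-complement wrap as on Linux. */
static int batch_not_written_yet(unsigned int bm_seq, unsigned int bm_write)
{
        return (int)(bm_seq - bm_write) > 0;
}

int main(void)
{
        /* Normal case: stripe is in batch 8, batches up to 7 written. */
        printf("%d\n", batch_not_written_yet(8, 7));            /* 1: must wait */
        printf("%d\n", batch_not_written_yet(8, 8));            /* 0: safe to write */

        /* Wraparound case: the write counter is about to wrap past UINT_MAX
         * while the stripe already carries a small post-wrap batch number. */
        printf("%d\n", batch_not_written_yet(3, 0xfffffffeu));  /* 1: still newer */

        /* A naive ">" comparison gets the wrapped case wrong: */
        printf("%d\n", 3u > 0xfffffffeu);                       /* 0: looks "older" */
        return 0;
}

The subtraction form is the usual way free-running sequence counters are
compared, which is presumably why the equality test was replaced.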
From d69504325978c461b51b03cca49626026970307b Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:20 -0700
Subject: [PATCH] md: include sector number in messages about corrected read errors

This is generally useful, but particularly helps see if it is the same
sector that always needs correcting, or different ones.

[akpm@osdl.org: fix printk warnings]
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 56303ff31730..450066007160 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -523,6 +523,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks, i;
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+        char b[BDEVNAME_SIZE];
+        mdk_rdev_t *rdev;
 
         if (bi->bi_size)
                 return 1;
@@ -570,25 +572,39 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
 #endif
                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-                        printk(KERN_INFO "raid5: read error corrected!!\n");
+                        rdev = conf->disks[i].rdev;
+                        printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+                               mdname(conf->mddev), STRIPE_SECTORS,
+                               (unsigned long long)sh->sector + rdev->data_offset,
+                               bdevname(rdev->bdev, b));
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
                 }
                 if (atomic_read(&conf->disks[i].rdev->read_errors))
                         atomic_set(&conf->disks[i].rdev->read_errors, 0);
         } else {
+                const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
                 int retry = 0;
+                rdev = conf->disks[i].rdev;
+
                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-                atomic_inc(&conf->disks[i].rdev->read_errors);
+                atomic_inc(&rdev->read_errors);
                 if (conf->mddev->degraded)
-                        printk(KERN_WARNING "raid5: read error not correctable.\n");
+                        printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+                               mdname(conf->mddev),
+                               (unsigned long long)sh->sector + rdev->data_offset,
+                               bdn);
                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                         /* Oh, no!!! */
-                        printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
-                else if (atomic_read(&conf->disks[i].rdev->read_errors)
+                        printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+                               mdname(conf->mddev),
+                               (unsigned long long)sh->sector + rdev->data_offset,
+                               bdn);
+                else if (atomic_read(&rdev->read_errors)
                          > conf->max_nr_stripes)
                         printk(KERN_WARNING
-                               "raid5: Too many read errors, failing device.\n");
+                               "raid5:%s: Too many read errors, failing device %s.\n",
+                               mdname(conf->mddev), bdn);
                 else
                         retry = 1;
                 if (retry)
@@ -596,7 +612,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 else {
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
-                        md_error(conf->mddev, conf->disks[i].rdev);
+                        md_error(conf->mddev, rdev);
                 }
         }
         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-- cgit