From f4370781d83cd2e52eb515e4663155e8091e4d4e Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:14 -0700
Subject: [PATCH] md: possible fix for unplug problem

I have reports of a problem with raid5 which turns out to be because the
raid5 device gets stuck in a 'plugged' state.  This shouldn't be able to
happen, as 3msec after the device is plugged it should be unplugged.
However it happens nonetheless.

This patch fixes the problem and is a reasonable thing to do, though it
might hurt performance slightly in some cases.

Until I can find the real problem, we should probably have this
workaround in place.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7433871f4b3a..5764387c6989 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -270,7 +270,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                              < (conf->max_nr_stripes *3/4)
                              || !conf->inactive_blocked),
                             conf->device_lock,
-                            unplug_slaves(conf->mddev)
+                            raid5_unplug_device(conf->mddev->queue)
                         );
                 conf->inactive_blocked = 0;
         } else
-- cgit

From 0b8c9de05c2a860fe6b02fedcb48763bcee648b3 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:15 -0700
Subject: [PATCH] md: delay starting md threads until array is completely setup

When an array is started, we start one or two threads (two if there is a
reshape or recovery that needs to be completed).

We currently start these *before* the array is completely set up, and in
particular before queue->queuedata is set.  If the thread actually starts
very quickly on another CPU, we can end up dereferencing queue->queuedata
before it has been set, and oops.

This patch also makes sure we don't try to start a recovery if a reshape
is being restarted.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5764387c6989..a02f35f1a796 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3246,9 +3246,6 @@ static int run(mddev_t *mddev)
                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                                                         "%s_reshape");
-                /* FIXME if md_register_thread fails?? */
-                md_wakeup_thread(mddev->sync_thread);
-
         }
 
         /* read-ahead size must cover two whole stripes, which is
-- cgit
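The ordering bug fixed in the patch above is easier to see outside the kernel.
The following stand-alone sketch is illustrative only - the struct, the field
value and the pthread framing are invented for the example, not taken from the
md code - but it shows the same shape of race: a worker thread must not be
started until the data it will dereference has been initialised.

/* Illustration only, not kernel code.  Build with: gcc -pthread race.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_queue {
        void *queuedata;                /* set by the "array setup" code */
};

static struct fake_queue queue;         /* static, so queuedata starts NULL */

static void *worker(void *arg)
{
        struct fake_queue *q = arg;

        /* With the buggy ordering the worker could run before queuedata is
         * assigned, see NULL here, and dereferencing it would be the oops. */
        if (q->queuedata == NULL)
                printf("worker ran too early: queuedata still NULL\n");
        else
                printf("worker saw queuedata = %s\n", (char *)q->queuedata);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        /* Fixed ordering: finish all setup first, then start the thread. */
        queue.queuedata = "mddev";
        if (pthread_create(&tid, NULL, worker, &queue) != 0) {
                perror("pthread_create");
                return EXIT_FAILURE;
        }
        pthread_join(tid, NULL);
        return EXIT_SUCCESS;
}

With the assignment done before pthread_create() the worker can never observe
the unset field, which is the effect the patch is after: the sync thread must
not run until the array, including queue->queuedata, is fully set up.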
From ff4e8d9a9f46e3a7f89d14ade52fe5d53a82c022 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:16 -0700
Subject: [PATCH] md: fix resync speed calculation for restarted resyncs

We introduced 'io_sectors' recently so we could count the sectors that cause
IO during resync separately from sectors which don't cause IO - there can be
a difference if a bitmap is being used to accelerate resync.

However when a speed is reported, we find the number of sectors processed
recently by subtracting an oldish io_sectors count from a current
'curr_resync' count.  This is wrong because curr_resync counts all sectors,
not just io sectors.

So, add a field to mddev to store the current io_sectors separately from
curr_resync, and use that in the calculations.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a02f35f1a796..dd0d00108a31 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -281,7 +281,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
         } else {
                 if (!test_bit(STRIPE_HANDLE, &sh->state))
                         atomic_inc(&conf->active_stripes);
-                if (list_empty(&sh->lru))
+                if (list_empty(&sh->lru) &&
+                    !test_bit(STRIPE_EXPANDING, &sh->state))
                         BUG();
                 list_del_init(&sh->lru);
         }
-- cgit

From 7c785b7a18dc30572a49c6b75efd384269735d14 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:16 -0700
Subject: [PATCH] md: fix a plug/unplug race in raid5

When a device is unplugged, requests are moved from one or two (depending on
whether a bitmap is in use) queues to the main request queue.

So whenever requests are put on either of those queues, we should make sure
the raid5 array is 'plugged'.  However we don't.  We currently plug the raid5
queue just before putting requests on those queues, so there is room for a
race.  If something unplugs the queue at just the wrong time, requests will
be left on the queue and nothing will want to unplug them.  Normally
something else will plug and unplug the queue fairly soon, but there is a
risk that nothing will.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dd0d00108a31..6ba394082129 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -88,12 +88,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                 BUG_ON(!list_empty(&sh->lru));
                 BUG_ON(atomic_read(&conf->active_stripes)==0);
                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                        if (test_bit(STRIPE_DELAYED, &sh->state))
+                        if (test_bit(STRIPE_DELAYED, &sh->state)) {
                                 list_add_tail(&sh->lru, &conf->delayed_list);
-                        else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                 conf->seq_write == sh->bm_seq)
+                                blk_plug_device(conf->mddev->queue);
+                        } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                   conf->seq_write == sh->bm_seq) {
                                 list_add_tail(&sh->lru, &conf->bitmap_list);
-                        else {
+                                blk_plug_device(conf->mddev->queue);
+                        } else {
                                 clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                 list_add_tail(&sh->lru, &conf->handle_list);
                         }
@@ -2555,13 +2557,6 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
         return ret;
 }
 
-static inline void raid5_plug_device(raid5_conf_t *conf)
-{
-        spin_lock_irq(&conf->device_lock);
-        blk_plug_device(conf->mddev->queue);
-        spin_unlock_irq(&conf->device_lock);
-}
-
 static int make_request(request_queue_t *q, struct bio * bi)
 {
         mddev_t *mddev = q->queuedata;
@@ -2671,7 +2666,6 @@ static int make_request(request_queue_t *q, struct bio * bi)
                                 goto retry;
                         }
                         finish_wait(&conf->wait_for_overlap, &w);
-                        raid5_plug_device(conf);
                         handle_stripe(sh, NULL);
                         release_stripe(sh);
                 } else {
-- cgit
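The fix above calls blk_plug_device() at the exact point where a stripe is
queued on the delayed or bitmap list, so queueing and plugging happen in the
same critical section.  The following user-space sketch is illustrative only:
the mutex, the counters and the function names are invented stand-ins for
conf->device_lock, the delayed/bitmap lists and the block-layer plug state,
not the real interfaces.

/* Illustration only, not kernel code.  Build with: gcc -pthread plug.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int queued;      /* requests sitting on a "delayed" list */
static int plugged;     /* whether a later unplug is guaranteed to run */

/* Fixed shape: queue the request and plug in one critical section, so no
 * unplug can ever observe "queued but not plugged". */
static void queue_delayed_request(void)
{
        pthread_mutex_lock(&lock);
        queued++;
        plugged = 1;
        pthread_mutex_unlock(&lock);
}

/* Unplug: release everything that was queued while plugged. */
static void unplug(void)
{
        pthread_mutex_lock(&lock);
        if (plugged) {
                printf("unplug: releasing %d delayed request(s)\n", queued);
                queued = 0;
                plugged = 0;
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        queue_delayed_request();
        /* If queueing and plugging were separate critical sections, an
         * unplug interleaved between the two steps could run with
         * plugged == 0, release nothing, and leave the queued request
         * waiting for an unplug that may never come - the race described
         * in the commit message above. */
        unplug();
        return 0;
}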
From ae3c20ccf84c88d45616f12122f781a900118f09 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:17 -0700
Subject: [PATCH] md: fix some small races in bitmap plugging in raid5

The comment gives more details, but I didn't quite have the sequencing
right, so there was room for races to leave bits unset in the on-disk
bitmap for short periods of time.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6ba394082129..56303ff31730 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -18,6 +18,30 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches.  Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->bm_write is the number of the last batch successfully written.
+ * conf->bm_flush is the number of the last batch that was closed to
+ *    new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is bm_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ *    we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment bm_flush, thus closing the current
+ *    batch.
+ * When we notice that bm_flush > bm_write, we write out all pending updates
+ * to the bitmap, and advance bm_write to where bm_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ *    miss any bits.
+ */
 #include 
 #include 
@@ -92,7 +116,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                                 list_add_tail(&sh->lru, &conf->delayed_list);
                                 blk_plug_device(conf->mddev->queue);
                         } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                   conf->seq_write == sh->bm_seq) {
+                                   sh->bm_seq - conf->seq_write > 0) {
                                 list_add_tail(&sh->lru, &conf->bitmap_list);
                                 blk_plug_device(conf->mddev->queue);
                         } else {
@@ -1273,9 +1297,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                 (unsigned long long)sh->sector, dd_idx);
 
         if (conf->mddev->bitmap && firstwrite) {
-                sh->bm_seq = conf->seq_write;
                 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                                   STRIPE_SECTORS, 0);
+                sh->bm_seq = conf->seq_flush+1;
                 set_bit(STRIPE_BIT_DELAY, &sh->state);
         }
 
@@ -2918,7 +2942,7 @@ static void raid5d (mddev_t *mddev)
         while (1) {
                 struct list_head *first;
 
-                if (conf->seq_flush - conf->seq_write > 0) {
+                if (conf->seq_flush != conf->seq_write) {
                         int seq = conf->seq_flush;
                         spin_unlock_irq(&conf->device_lock);
                         bitmap_unplug(mddev->bitmap);
-- cgit
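A key change above is the comparison in __release_stripe(): instead of asking
whether the stripe's batch number equals the last written batch, the stripe is
now delayed while its batch is still ahead of what has been written
(sh->bm_seq - conf->seq_write > 0).  A difference test of that shape stays
correct even if the stripe's batch is more than one ahead or the counters
wrap.  The sketch below is illustrative only - the function name, the integer
widths and the sample values are chosen for the example, not taken from the
raid5 code.

/* Illustration only, not kernel code.  Build with: gcc seq.c */
#include <stdio.h>

/* "batch bm_seq has not yet been written out" - a signed interpretation of
 * the difference, assuming two's-complement wrap as on Linux. */
static int batch_not_written_yet(unsigned int bm_seq, unsigned int bm_write)
{
        return (int)(bm_seq - bm_write) > 0;
}

int main(void)
{
        /* Normal case: stripe is in batch 8, batches up to 7 written. */
        printf("%d\n", batch_not_written_yet(8, 7));            /* 1: must wait */
        printf("%d\n", batch_not_written_yet(8, 8));            /* 0: safe to write */

        /* Wraparound case: the write counter is about to wrap past UINT_MAX
         * while the stripe already carries a small post-wrap batch number. */
        printf("%d\n", batch_not_written_yet(3, 0xfffffffeu));  /* 1: still newer */

        /* A naive ">" comparison gets the wrapped case wrong: */
        printf("%d\n", 3u > 0xfffffffeu);                       /* 0: looks "older" */
        return 0;
}

The subtraction form is the usual way free-running sequence counters are
compared, which is presumably why the equality test was replaced.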
From d69504325978c461b51b03cca49626026970307b Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 10 Jul 2006 04:44:20 -0700
Subject: [PATCH] md: include sector number in messages about corrected read errors

This is generally useful, but particularly helps see if it is the same
sector that always needs correcting, or different ones.

[akpm@osdl.org: fix printk warnings]
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 56303ff31730..450066007160 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -523,6 +523,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks, i;
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+        char b[BDEVNAME_SIZE];
+        mdk_rdev_t *rdev;
 
         if (bi->bi_size)
                 return 1;
@@ -570,25 +572,39 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
 #endif
                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-                        printk(KERN_INFO "raid5: read error corrected!!\n");
+                        rdev = conf->disks[i].rdev;
+                        printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+                               mdname(conf->mddev), STRIPE_SECTORS,
+                               (unsigned long long)sh->sector + rdev->data_offset,
+                               bdevname(rdev->bdev, b));
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
                 }
                 if (atomic_read(&conf->disks[i].rdev->read_errors))
                         atomic_set(&conf->disks[i].rdev->read_errors, 0);
         } else {
+                const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
                 int retry = 0;
+                rdev = conf->disks[i].rdev;
+
                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-                atomic_inc(&conf->disks[i].rdev->read_errors);
+                atomic_inc(&rdev->read_errors);
                 if (conf->mddev->degraded)
-                        printk(KERN_WARNING "raid5: read error not correctable.\n");
+                        printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+                               mdname(conf->mddev),
+                               (unsigned long long)sh->sector + rdev->data_offset,
+                               bdn);
                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                         /* Oh, no!!! */
-                        printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
-                else if (atomic_read(&conf->disks[i].rdev->read_errors)
+                        printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+                               mdname(conf->mddev),
+                               (unsigned long long)sh->sector + rdev->data_offset,
+                               bdn);
+                else if (atomic_read(&rdev->read_errors)
                          > conf->max_nr_stripes)
                         printk(KERN_WARNING
-                               "raid5: Too many read errors, failing device.\n");
+                               "raid5:%s: Too many read errors, failing device %s.\n",
+                               mdname(conf->mddev), bdn);
                 else
                         retry = 1;
                 if (retry)
@@ -596,7 +612,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 else {
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
-                        md_error(conf->mddev, conf->disks[i].rdev);
+                        md_error(conf->mddev, rdev);
                 }
         }
         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-- cgit