From 8f6c2e4b325a8e9f8f47febb2fd0ed4fae7d45a9 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Jul 2009 11:13:45 +1000 Subject: md: Use new topology calls to indicate alignment and I/O sizes Switch MD over to the new disk_stack_limits() function which checks for aligment and adjusts preferred I/O sizes when stacking. Also indicate preferred I/O sizes where applicable. Signed-off-by: Martin K. Petersen Signed-off-by: Mike Snitzer Signed-off-by: NeilBrown --- drivers/md/raid5.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e6e138..92ef9b6abfc7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4452,7 +4452,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) static int run(mddev_t *mddev) { raid5_conf_t *conf; - int working_disks = 0; + int working_disks = 0, chunk_size; mdk_rdev_t *rdev; if (mddev->recovery_cp != MaxSector) @@ -4607,6 +4607,14 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); + + list_for_each_entry(rdev, &mddev->disks, same_set) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); return 0; abort: -- cgit From a5c308d4d1659b1f4833b863394e3e24cdbdfc6e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Jul 2009 13:15:35 +1000 Subject: md/raid5: suspend shouldn't affect read requests. md allows write to regions on an array to be suspended temporarily. This allows user-space to participate is aspects of reshape. In particular, data can be copied with not risk of a race. We should not be blocking read requests though, so don't. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 92ef9b6abfc7..1f444ae07f89 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3702,7 +3702,8 @@ static int make_request(struct request_queue *q, struct bio * bi) /* FIXME what if we get a false positive because these * are being updated. */ - if (logical_sector >= mddev->suspend_lo && + if (bio_data_dir(bi) == WRITE && + logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { release_stripe(sh); schedule(); -- cgit From e62e58a5ffdc98ac28d8dbd070c857620d541f99 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Jul 2009 13:15:35 +1000 Subject: md: use interruptible wait when duration is controlled by userspace. User space can set various limits on an md array so that resync waits when it gets to a certain point, or so that I/O is blocked for a short while. When md is waiting against one of these limit, it should use an interruptible wait so as not to add to the load average, and so are not to trigger a warning if the wait goes on for too long. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1f444ae07f89..37835538b58e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3699,14 +3699,21 @@ static int make_request(struct request_queue *q, struct bio * bi) goto retry; } } - /* FIXME what if we get a false positive because these - * are being updated. - */ + if (bio_data_dir(bi) == WRITE && logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { release_stripe(sh); - schedule(); + /* As the suspend_* range is controlled by + * userspace, we want an interruptible + * wait. + */ + flush_signals(current); + prepare_to_wait(&conf->wait_for_overlap, + &w, TASK_INTERRUPTIBLE); + if (logical_sector >= mddev->suspend_lo && + logical_sector < mddev->suspend_hi) + schedule(); goto retry; } -- cgit From 95fc17aac45300f45968aacd97a536ddd8db8101 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Jul 2009 12:39:15 +1000 Subject: md/raid6: release spare page at ->stop() Add missing call to safe_put_page from stop() by unifying open coded raid5_conf_t de-allocation under free_conf(). Cc: Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/raid5.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 37835538b58e..39374230a463 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4316,6 +4316,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) return sectors * (raid_disks - conf->max_degraded); } +static void free_conf(raid5_conf_t *conf) +{ + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); +} + static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; @@ -4447,11 +4456,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) abort: if (conf) { - shrink_stripes(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); return ERR_PTR(-EIO); } else return ERR_PTR(-ENOMEM); @@ -4629,12 +4634,8 @@ abort: md_unregister_thread(mddev->thread); mddev->thread = NULL; if (conf) { - shrink_stripes(conf); print_raid5_conf(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); } mddev->private = NULL; printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); @@ -4649,13 +4650,10 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; - shrink_stripes(conf); - kfree(conf->stripe_hashtbl); mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); - kfree(conf->disks); - kfree(conf); + free_conf(conf); mddev->private = NULL; return 0; } -- cgit From e516402c0d4fc02be4af9fa8c18954d4f9deb44e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:57 +1000 Subject: md/raid5: set reshape_position correctly when reshape starts. As the internal reshape_progress counter is the main driver for reshape, the fact that reshape_position sometimes starts with the wrong value has minimal effect. It is visible in sysfs and that is all. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 39374230a463..659151e5eda4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5000,7 +5000,7 @@ static int raid5_start_reshape(mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); } mddev->raid_disks = conf->raid_disks; - mddev->reshape_position = 0; + mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); -- cgit From 64bd660b51b2da92e99a5e97349f6558349f11c5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:58 +1000 Subject: md: allow raid5_quiesce to work properly when reshape is happening. The ->quiesce method is not supposed to stop resync/recovery/reshape, just normal IO. But in raid5 we don't have a way to know which stripes are being used for normal IO and which for resync etc, so we need to wait for all stripes to be idle to be sure that all writes have completed. However reshape keeps at least some stripe busy for an extended period of time, so a call to raid5_quiesce can block for several seconds needlessly. So arrange for reshape etc to pause briefly while raid5_quiesce is trying to quiesce the array so that the active_stripes count can drop to zero. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 659151e5eda4..2dc35b4c20ac 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3999,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return 0; } + /* Allow raid5_quiesce to complete */ + wait_event(conf->wait_for_overlap, conf->quiesce != 2); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); @@ -5104,12 +5107,18 @@ static void raid5_quiesce(mddev_t *mddev, int state) case 1: /* stop all writes */ spin_lock_irq(&conf->device_lock); - conf->quiesce = 1; + /* '2' tells resync/reshape to pause so that all + * active stripes can drain + */ + conf->quiesce = 2; wait_event_lock_irq(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, conf->device_lock, /* nothing */); + conf->quiesce = 1; spin_unlock_irq(&conf->device_lock); + /* allow reshape to continue */ + wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ -- cgit From 449aad3e25358812c43afc60918c5ad3819488e7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:58 +1000 Subject: md: Use revalidate_disk to effect changes in size of device. As revalidate_disk calls check_disk_size_change, it will cause any capacity change of a gendisk to be propagated to the blockdev inode. So use that instead of mucking about with locks and i_size_write. Also add a call to revalidate_disk in do_md_run and a few other places where the gendisk capacity is changed. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2dc35b4c20ac..2b521ee67dfa 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4858,6 +4858,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; + revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5058,7 +5059,6 @@ static void end_reshape(raid5_conf_t *conf) */ static void raid5_finish_reshape(mddev_t *mddev) { - struct block_device *bdev; raid5_conf_t *conf = mddev->private; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5067,15 +5067,7 @@ static void raid5_finish_reshape(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } + revalidate_disk(mddev->gendisk); } else { int d; mddev->degraded = conf->raid_disks; -- cgit From 67ac6011db5d2b0c853d573ff474b25c85dfb644 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:06:24 +1000 Subject: md/raid5: allow new reshape modes to be restarted in the middle. md/raid5 doesn't allow a reshape to restart if it involves writing over the same part of disk that it would be reading from. This happens at the beginning of a reshape that increases the number of devices, at the end of a reshape that decreases the number of devices, and continuously for a reshape that does not change the number of devices. The current code is correct for the "increase number of devices" case as the critical section at the start is handled by userspace performing a backup. It does not work for reducing the number of devices, or the no-change case. For 'reducing', we need to invert the test. For no-change we cannot really be sure things will be safe, so simply require the array to be read-only, which is how the user-space code which carefully starts such arrays works. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2b521ee67dfa..b8a22a2205cf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4509,7 +4509,26 @@ static int run(mddev_t *mddev) (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ - if (here_new >= here_old) { + if (mddev->delta_disks == 0) { + /* We cannot be sure it is safe to start an in-place + * reshape. It is only safe if user-space if monitoring + * and taking constant backups. + * mdadm always starts a situation like this in + * readonly mode so it can take control before + * allowing any writes. So just check for that. + */ + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors) || + mddev->ro == 0) { + printk(KERN_ERR "raid5: in-place reshape must be started" + " in read-only mode - aborting\n"); + return -EINVAL; + } + } else if (mddev->delta_disks < 0 + ? (here_new * mddev->new_chunk_sectors <= + here_old * mddev->chunk_sectors) + : (here_new * mddev->new_chunk_sectors >= + here_old * mddev->chunk_sectors)) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "raid5: reshape_position too early for " "auto-recovery - aborting.\n"); -- cgit From a639755cf885e437b2fe4168d35157fa90d530ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:13:00 +1000 Subject: md/raid5: make sure a reshape restarts at the correct address. This "if" don't allow for the possibility that the number of devices doesn't change, and so sector_nr isn't set correctly in that case. So change '>' to '>='. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b8a22a2205cf..94a74cb5cccb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; - } else if (mddev->delta_disks > 0 && + } else if (mddev->delta_disks >= 0 && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); -- cgit From 1a67dde0abba36421a1257d01ba9de2f6d1c160a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:41:49 +1000 Subject: md/raid5: Properly remove excess drives after shrinking a raid5/6 We were removing the drives, from the array, but not removing symlinks from /sys/.... and not marking the device as having been removed. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 94a74cb5cccb..b8a2c5dc67ba 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5097,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev) mddev->degraded--; for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; - d++) - raid5_remove_disk(mddev, d); + d++) { + mdk_rdev_t *rdev = conf->disks[d].rdev; + if (rdev && raid5_remove_disk(mddev, d) == 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = -1; + } + } } mddev->layout = conf->algorithm; mddev->chunk_sectors = conf->chunk_sectors; -- cgit