summaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c218
1 files changed, 125 insertions, 93 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 48129de21aec..f2e3c3e2d879 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,6 +36,7 @@
*/
#include <linux/blkdev.h>
+#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@@ -760,6 +761,7 @@ enum stripe_result {
STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL,
+ STRIPE_WAIT_RESHAPE,
};
struct stripe_request_ctx {
@@ -1293,10 +1295,7 @@ again:
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bi,
- disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+ mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi);
else
@@ -1340,10 +1339,7 @@ again:
*/
if (op == REQ_OP_DISCARD)
rbi->bi_vcnt = 0;
- if (conf->mddev->gendisk)
- trace_block_bio_remap(rbi,
- disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+ mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, rbi);
else
@@ -2420,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num)
size_t namelen = sizeof(conf->cache_name[0]);
int devs = max(conf->raid_disks, conf->previous_raid_disks);
- if (conf->mddev->gendisk)
+ if (mddev_is_dm(conf->mddev))
snprintf(conf->cache_name[0], namelen,
- "raid%d-%s", conf->level, mdname(conf->mddev));
+ "raid%d-%p", conf->level, conf->mddev);
else
snprintf(conf->cache_name[0], namelen,
- "raid%d-%p", conf->level, conf->mddev);
+ "raid%d-%s", conf->level, mdname(conf->mddev));
snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
conf->active_name = 0;
@@ -4199,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- if (conf->mddev->queue)
- blk_add_trace_msg(conf->mddev->queue,
- "raid5 rmw %llu %d",
- (unsigned long long)sh->sector, rmw);
+ mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
+ sh->sector, rmw);
+
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_InJournal, &dev->flags) &&
@@ -4279,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state);
}
}
- if (rcw && conf->mddev->queue)
- blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
- (unsigned long long)sh->sector,
- rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
+ if (rcw && !mddev_is_dm(conf->mddev))
+ blk_add_trace_msg(conf->mddev->gendisk->queue,
+ "raid5 rcw %llu %d %d %d",
+ (unsigned long long)sh->sector, rcw, qread,
+ test_bit(STRIPE_DELAYED, &sh->state));
}
if (rcw > disks && rmw > disks &&
@@ -5521,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
spin_unlock_irq(&conf->device_lock);
}
- if (mddev->gendisk)
- trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
- raid_bio->bi_iter.bi_sector);
+ mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector);
submit_bio_noacct(align_bio);
return 1;
}
@@ -5692,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
}
release_inactive_stripe_list(conf, cb->temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
- if (mddev->queue)
- trace_block_unplug(mddev->queue, cnt, !from_schedule);
+ if (!mddev_is_dm(mddev))
+ trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule);
kfree(cb);
}
@@ -5937,7 +5931,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- return STRIPE_SCHEDULE_AND_RETRY;
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6016,6 +6011,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
+out:
+ if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
+ bi->bi_status = BLK_STS_RESOURCE;
+ ret = STRIPE_WAIT_RESHAPE;
+ pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
+ }
return ret;
}
@@ -6137,7 +6138,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
- if (res == STRIPE_FAIL)
+ if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
break;
if (res == STRIPE_RETRY)
@@ -6175,6 +6176,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
if (rw == WRITE)
md_write_end(mddev);
+ if (res == STRIPE_WAIT_RESHAPE) {
+ md_free_cloned_bio(bi);
+ return false;
+ }
+
bio_endio(bi);
return true;
}
@@ -6764,7 +6770,18 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
+
+ /*
+ * Waiting on MD_SB_CHANGE_PENDING below may deadlock
+ * seeing md_check_recovery() is needed to clear
+ * the flag when using mdmon.
+ */
+ continue;
}
+
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -7073,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
if (!conf)
err = -ENODEV;
else if (new != conf->skip_copy) {
- struct request_queue *q = mddev->queue;
+ struct request_queue *q = mddev->gendisk->queue;
conf->skip_copy = new;
if (new)
@@ -7675,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0;
}
-static void raid5_set_io_opt(struct r5conf *conf)
+static int raid5_set_limits(struct mddev *mddev)
{
- blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
- (conf->raid_disks - conf->max_degraded));
+ struct r5conf *conf = mddev->private;
+ struct queue_limits lim;
+ int data_disks, stripe;
+ struct md_rdev *rdev;
+
+ /*
+ * The read-ahead size must cover two whole stripes, which is
+ * 2 * (datadisks) * chunksize where 'n' is the number of raid devices.
+ */
+ data_disks = conf->previous_raid_disks - conf->max_degraded;
+
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
+
+ blk_set_stacking_limits(&lim);
+ lim.io_min = mddev->chunk_sectors << 9;
+ lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
+ lim.raid_partial_stripes_expensive = 1;
+ lim.discard_granularity = stripe;
+ lim.max_write_zeroes_sectors = 0;
+ mddev_stack_rdev_limits(mddev, &lim);
+ rdev_for_each(rdev, mddev)
+ queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
+ mddev->gendisk->disk_name);
+
+ /*
+ * Zeroing is required for discard, otherwise data could be lost.
+ *
+ * Consider a scenario: discard a stripe (the stripe could be
+ * inconsistent if discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again depending on which
+ * disks are used to calculate parity); the disk is broken; The stripe
+ * data of this disk is lost.
+ *
+ * We only allow DISCARD if the sysadmin has confirmed that only safe
+ * devices are in use by setting a module parameter. A better idea
+ * might be to turn DISCARD into WRITE_ZEROES requests, as that is
+ * required to be safe.
+ */
+ if (!devices_handle_discard_safely ||
+ lim.max_discard_sectors < (stripe >> 9) ||
+ lim.discard_granularity < stripe)
+ lim.max_hw_discard_sectors = 0;
+
+ /*
+ * Requests require having a bitmap for each stripe.
+ * Limit the max sectors based on this.
+ */
+ lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
+
+ /* No restrictions on the number of segments in the request */
+ lim.max_segments = USHRT_MAX;
+
+ return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid5_run(struct mddev *mddev)
@@ -7691,6 +7763,7 @@ static int raid5_run(struct mddev *mddev)
int i;
long long min_offset_diff = 0;
int first = 1;
+ int ret = -EIO;
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7943,66 +8016,10 @@ static int raid5_run(struct mddev *mddev)
mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
- if (mddev->queue) {
- int chunk_size;
- /* read-ahead size must cover two whole stripes, which
- * is 2 * (datadisks) * chunksize where 'n' is the
- * number of raid devices
- */
- int data_disks = conf->previous_raid_disks - conf->max_degraded;
- int stripe = data_disks *
- ((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
- chunk_size = mddev->chunk_sectors << 9;
- blk_queue_io_min(mddev->queue, chunk_size);
- raid5_set_io_opt(conf);
- mddev->queue->limits.raid_partial_stripes_expensive = 1;
- /*
- * We can only discard a whole stripe. It doesn't make sense to
- * discard data disk but write parity disk
- */
- stripe = stripe * PAGE_SIZE;
- stripe = roundup_pow_of_two(stripe);
- mddev->queue->limits.discard_granularity = stripe;
-
- blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-
- rdev_for_each(rdev, mddev) {
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->new_data_offset << 9);
- }
-
- /*
- * zeroing is required, otherwise data
- * could be lost. Consider a scenario: discard a stripe
- * (the stripe could be inconsistent if
- * discard_zeroes_data is 0); write one disk of the
- * stripe (the stripe could be inconsistent again
- * depending on which disks are used to calculate
- * parity); the disk is broken; The stripe data of this
- * disk is lost.
- *
- * We only allow DISCARD if the sysadmin has confirmed that
- * only safe devices are in use by setting a module parameter.
- * A better idea might be to turn DISCARD into WRITE_ZEROES
- * requests, as that is required to be safe.
- */
- if (!devices_handle_discard_safely ||
- mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
- mddev->queue->limits.discard_granularity < stripe)
- blk_queue_max_discard_sectors(mddev->queue, 0);
-
- /*
- * Requests require having a bitmap for each stripe.
- * Limit the max sectors based on this.
- */
- blk_queue_max_hw_sectors(mddev->queue,
- RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
-
- /* No restrictions on the number of segments in the request */
- blk_queue_max_segments(mddev->queue, USHRT_MAX);
+ if (!mddev_is_dm(mddev)) {
+ ret = raid5_set_limits(mddev);
+ if (ret)
+ goto abort;
}
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@@ -8015,7 +8032,7 @@ abort:
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
- return -EIO;
+ return ret;
}
static void raid5_free(struct mddev *mddev, void *priv)
@@ -8547,8 +8564,8 @@ static void end_reshape(struct r5conf *conf)
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- if (conf->mddev->queue)
- raid5_set_io_opt(conf);
+ mddev_update_io_opt(conf->mddev,
+ conf->raid_disks - conf->max_degraded);
}
}
@@ -8925,6 +8942,18 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}
+/*
+ * This is only used for dm-raid456, caller already frozen sync_thread, hence
+ * if rehsape is still in progress, io that is waiting for reshape can never be
+ * done now, hence wake up and handle those IO.
+ */
+static void raid5_prepare_suspend(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ wake_up(&conf->wait_for_overlap);
+}
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -8948,6 +8977,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid5_personality =
{
@@ -8972,6 +9002,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid4_personality =
@@ -8997,6 +9028,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static int __init raid5_init(void)