summaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c 469
1 files changed, 258 insertions, 211 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d874abfc1836..ca5b0e8ba707 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,7 +36,6 @@
*/
#include <linux/blkdev.h>
-#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@@ -156,7 +155,7 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void print_raid5_conf (struct r5conf *conf);
+static void print_raid5_conf(struct r5conf *conf);
static int stripe_operations_active(struct stripe_head *sh)
{
@@ -907,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
- !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
- is_full_stripe_write(sh);
+ is_full_stripe_write(sh);
}
/* we only do back search */
@@ -1242,10 +1240,6 @@ again:
}
if (rdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
@@ -1302,10 +1296,6 @@ again:
submit_bio_noacct(bi);
}
if (rrdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
@@ -1346,8 +1336,6 @@ again:
submit_bio_noacct(rbi);
}
if (!rdev && !rrdev) {
- if (op_is_write(op))
- set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %d on disc %d for sector %llu\n",
bi->bi_opf, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -2338,7 +2326,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
- wake_up(&sh->raid_conf->wait_for_overlap);
+ wake_up_bit(&dev->flags, R5_Overlap);
}
}
local_unlock(&conf->percpu->lock);
@@ -2885,7 +2873,6 @@ static void raid5_end_write_request(struct bio *bi)
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
- set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -3474,7 +3461,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
* With PPL only writes to consecutive data chunks within a
* stripe are allowed because for a single stripe_head we can
* only have one PPL entry at a time, which describes one data
- * range. Not really an overlap, but wait_for_overlap can be
+ * range. Not really an overlap, but R5_Overlap can be
* used to handle this.
*/
sector_t sector;
@@ -3549,29 +3536,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
sh->dev[dd_idx].sector);
- if (conf->mddev->bitmap && firstwrite) {
- /* Cannot hold spinlock over bitmap_startwrite,
- * but must ensure this isn't added to a batch until
- * we have added to the bitmap and set bm_seq.
- * So set STRIPE_BITMAP_PENDING to prevent
- * batching.
- * If multiple __add_stripe_bio() calls race here they
- * much all set STRIPE_BITMAP_PENDING. So only the first one
- * to complete "bitmap_startwrite" gets to set
- * STRIPE_BIT_DELAY. This is important as once a stripe
- * is added to a batch, STRIPE_BIT_DELAY cannot be changed
- * any more.
- */
- set_bit(STRIPE_BITMAP_PENDING, &sh->state);
- spin_unlock_irq(&sh->stripe_lock);
- md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0);
- spin_lock_irq(&sh->stripe_lock);
- clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
- if (!sh->batch_head) {
- sh->bm_seq = conf->seq_flush+1;
- set_bit(STRIPE_BIT_DELAY, &sh->state);
- }
+ if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}
@@ -3622,7 +3589,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
- int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3647,13 +3613,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
- if (bi)
- bitmap_end = 1;
log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
@@ -3663,10 +3627,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
- if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0, 0);
- bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
@@ -3675,7 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}
- if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3697,7 +3656,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
@@ -3709,9 +3668,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
}
- if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0, 0);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -3735,7 +3691,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
@@ -4060,10 +4016,7 @@ returnbi:
bio_endio(wbi);
wbi = wbi2;
}
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- 0);
+
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -4340,7 +4293,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
s->locked++;
set_bit(R5_Wantwrite, &dev->flags);
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
case check_state_run:
@@ -4497,7 +4449,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
clear_bit(R5_Wantwrite, &dev->flags);
s->locked--;
}
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
@@ -4723,14 +4674,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev) {
is_bad = rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf));
- if (s->blocked_rdev == NULL
- && (test_bit(Blocked, &rdev->flags)
- || is_bad < 0)) {
+ if (s->blocked_rdev == NULL) {
if (is_bad < 0)
- set_bit(BlockedBadBlocks,
- &rdev->flags);
- s->blocked_rdev = rdev;
- atomic_inc(&rdev->nr_pending);
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ if (rdev_blocked(rdev)) {
+ s->blocked_rdev = rdev;
+ atomic_inc(&rdev->nr_pending);
+ }
}
}
clear_bit(R5_Insync, &dev->flags);
@@ -4876,7 +4826,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
{
struct stripe_head *sh, *next;
int i;
- int do_wakeup = 0;
list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
@@ -4892,8 +4841,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_COMPUTE_RUN) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
- (1 << STRIPE_BATCH_ERR) |
- (1 << STRIPE_BITMAP_PENDING)),
+ (1 << STRIPE_BATCH_ERR)),
"stripe state: %lx\n", sh->state);
WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
(1 << STRIPE_REPLACED)),
@@ -4901,7 +4849,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
(1 << STRIPE_PREREAD_ACTIVE) |
- (1 << STRIPE_DEGRADED) |
(1 << STRIPE_ON_UNPLUG_LIST)),
head_sh->state & (1 << STRIPE_INSYNC));
@@ -4912,7 +4859,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&sh->stripe_lock);
for (i = 0; i < sh->disks; i++) {
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
sh->dev[i].flags = head_sh->dev[i].flags &
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
}
@@ -4926,12 +4873,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&head_sh->stripe_lock);
for (i = 0; i < head_sh->disks; i++)
if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&head_sh->dev[i].flags, R5_Overlap);
if (head_sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &head_sh->state);
-
- if (do_wakeup)
- wake_up(&head_sh->raid_conf->wait_for_overlap);
}
static void handle_stripe(struct stripe_head *sh)
@@ -5197,7 +5141,7 @@ static void handle_stripe(struct stripe_head *sh)
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
}
/* If the failed drives are just a ReadError, then we might need
@@ -5260,7 +5204,7 @@ static void handle_stripe(struct stripe_head *sh)
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
@@ -5754,12 +5698,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
int d;
again:
sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
- prepare_to_wait(&conf->wait_for_overlap, &w,
- TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
raid5_release_stripe(sh);
- schedule();
+ wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap,
+ TASK_UNINTERRUPTIBLE);
goto again;
}
clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5771,12 +5714,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
raid5_release_stripe(sh);
- schedule();
+ wait_on_bit(&sh->dev[d].flags, R5_Overlap,
+ TASK_UNINTERRUPTIBLE);
goto again;
}
}
set_bit(STRIPE_DISCARD, &sh->state);
- finish_wait(&conf->wait_for_overlap, &w);
sh->overwrite_disks = 0;
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -5789,13 +5732,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0;
- d < conf->raid_disks - conf->max_degraded;
- d++)
- md_bitmap_startwrite(mddev->bitmap,
- sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- 0);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
@@ -5856,7 +5792,6 @@ static int add_all_stripe_bios(struct r5conf *conf,
struct bio *bi, int forwrite, int previous)
{
int dd_idx;
- int ret = 1;
spin_lock_irq(&sh->stripe_lock);
@@ -5872,14 +5807,19 @@ static int add_all_stripe_bios(struct r5conf *conf,
if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
set_bit(R5_Overlap, &dev->flags);
- ret = 0;
- continue;
+ spin_unlock_irq(&sh->stripe_lock);
+ raid5_release_stripe(sh);
+ /* release batch_last before wait to avoid risk of deadlock */
+ if (ctx->batch_last) {
+ raid5_release_stripe(ctx->batch_last);
+ ctx->batch_last = NULL;
+ }
+ md_wakeup_thread(conf->mddev->thread);
+ wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE);
+ return 0;
}
}
- if (!ret)
- goto out;
-
for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
struct r5dev *dev = &sh->dev[dd_idx];
@@ -5895,9 +5835,92 @@ static int add_all_stripe_bios(struct r5conf *conf,
RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
}
-out:
spin_unlock_irq(&sh->stripe_lock);
- return ret;
+ return 1;
+}
+
+enum reshape_loc { /* position of a logical sector relative to an ongoing reshape, as computed by get_reshape_loc() */
+ LOC_NO_RESHAPE, /* conf->reshape_progress is MaxSector */
+ LOC_AHEAD_OF_RESHAPE, /* ahead of reshape_progress; make_stripe_request() then maps with previous = 1 */
+ LOC_INSIDE_RESHAPE, /* not ahead of reshape_progress but ahead of reshape_safe; make_stripe_request() retries later */
+ LOC_BEHIND_RESHAPE, /* ahead of neither reshape_progress nor reshape_safe */
+};
+
+/*
+ * Classify @logical_sector against the reshape window described by
+ * conf->reshape_progress and conf->reshape_safe.
+ *
+ * Returns LOC_NO_RESHAPE when reshape_progress is MaxSector, otherwise
+ * LOC_AHEAD_OF_RESHAPE, LOC_INSIDE_RESHAPE or LOC_BEHIND_RESHAPE depending
+ * on what ahead_of_reshape() reports for the two positions.
+ *
+ * The answer is only a snapshot: reshape_progress may move once
+ * device_lock is dropped, so callers have to re-check after they hold a
+ * reference to the stripe they picked.
+ */
+static enum reshape_loc get_reshape_loc(struct mddev *mddev,
+		struct r5conf *conf, sector_t logical_sector)
+{
+	sector_t progress, safe;
+
+	/* Common case, checked without taking the lock. */
+	if (likely(conf->reshape_progress == MaxSector))
+		return LOC_NO_RESHAPE;
+
+	/*
+	 * Take both values under device_lock: reshape_progress may be
+	 * 64bit on a 32bit platform, so an unlocked read could observe a
+	 * half-updated value.
+	 */
+	spin_lock_irq(&conf->device_lock);
+	progress = conf->reshape_progress;
+	safe = conf->reshape_safe;
+	spin_unlock_irq(&conf->device_lock);
+
+	if (progress == MaxSector)
+		return LOC_NO_RESHAPE;
+
+	if (ahead_of_reshape(mddev, logical_sector, progress))
+		return LOC_AHEAD_OF_RESHAPE;
+
+	return ahead_of_reshape(mddev, logical_sector, safe) ?
+		LOC_INSIDE_RESHAPE : LOC_BEHIND_RESHAPE;
+}
+
+/*
+ * raid5_bitmap_sector() - personality ->bitmap_sector() hook.
+ * @mddev:   array the IO is addressed to
+ * @offset:  in: first array sector of the IO;
+ *           out: first sector of the converted range
+ * @sectors: in: length of the IO in sectors;
+ *           out: length of the converted range
+ *
+ * The IO range is widened to whole stripe rows (chunk_sectors times the
+ * number of data disks) and both ends are translated with
+ * raid5_compute_sector().  When the start of the IO is ahead of a running
+ * reshape, the same is done for the previous geometry and the union of
+ * both ranges is returned.
+ *
+ * NOTE(review): round_down()/round_up() are presumably the kernel's
+ * power-of-two rounding helpers, while sectors_per_chunk is chunk_sectors
+ * multiplied by the data-disk count - confirm that this product is always
+ * a power of two, or switch to a division based rounding.
+ */
+static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
+				unsigned long *sectors)
+{
+	struct r5conf *conf = mddev->private;
+	sector_t start = *offset;
+	sector_t end = start + *sectors;
+	sector_t prev_start = start;
+	sector_t prev_end = end;
+	int sectors_per_chunk;
+	enum reshape_loc loc;
+	int dd_idx;
+
+	/* Current geometry: expand outwards to full stripe rows. */
+	sectors_per_chunk = conf->chunk_sectors *
+		(conf->raid_disks - conf->max_degraded);
+	start = round_down(start, sectors_per_chunk);
+	end = round_up(end, sectors_per_chunk);
+
+	start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
+	end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL);
+
+	/*
+	 * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make
+	 * progress, hence it's the same as LOC_BEHIND_RESHAPE.
+	 */
+	loc = get_reshape_loc(mddev, conf, prev_start);
+	if (likely(loc != LOC_AHEAD_OF_RESHAPE)) {
+		*offset = start;
+		*sectors = end - start;
+		return;
+	}
+
+	/*
+	 * Previous geometry: the end must be rounded up, exactly like 'end'
+	 * above.  Rounding it down would cut the last, partially covered
+	 * stripe row out of the range (and leave the range empty for an IO
+	 * that fits inside a single row).
+	 */
+	sectors_per_chunk = conf->prev_chunk_sectors *
+		(conf->previous_raid_disks - conf->max_degraded);
+	prev_start = round_down(prev_start, sectors_per_chunk);
+	prev_end = round_up(prev_end, sectors_per_chunk);
+
+	prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL);
+	prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL);
+
+	/*
+	 * for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO
+	 * is handled in make_stripe_request(), we can't know this here hence
+	 * we set bits for both.
+	 */
+	*offset = min(start, prev_start);
+	*sectors = max(end, prev_end) - *offset;
 }
static enum stripe_result make_stripe_request(struct mddev *mddev,
@@ -5907,36 +5930,19 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
const int rw = bio_data_dir(bi);
enum stripe_result ret;
struct stripe_head *sh;
+ enum reshape_loc loc;
sector_t new_sector;
int previous = 0, flags = 0;
int seq, dd_idx;
seq = read_seqcount_begin(&conf->gen_lock);
-
- if (unlikely(conf->reshape_progress != MaxSector)) {
- /*
- * Spinlock is needed as reshape_progress may be
- * 64bit on a 32bit platform, and so it might be
- * possible to see a half-updated value
- * Of course reshape_progress could change after
- * the lock is dropped, so once we get a reference
- * to the stripe that we think it is, we will have
- * to check again.
- */
- spin_lock_irq(&conf->device_lock);
- if (ahead_of_reshape(mddev, logical_sector,
- conf->reshape_progress)) {
- previous = 1;
- } else {
- if (ahead_of_reshape(mddev, logical_sector,
- conf->reshape_safe)) {
- spin_unlock_irq(&conf->device_lock);
- ret = STRIPE_SCHEDULE_AND_RETRY;
- goto out;
- }
- }
- spin_unlock_irq(&conf->device_lock);
+ loc = get_reshape_loc(mddev, conf, logical_sector);
+ if (loc == LOC_INSIDE_RESHAPE) {
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
+ if (loc == LOC_AHEAD_OF_RESHAPE)
+ previous = 1;
new_sector = raid5_compute_sector(conf, logical_sector, previous,
&dd_idx, NULL);
@@ -5974,17 +5980,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
goto out_release;
}
- if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
- /*
- * Stripe is busy expanding or add failed due to
- * overlap. Flush everything and wait a while.
- */
+ if (test_bit(STRIPE_EXPANDING, &sh->state)) {
md_wakeup_thread(mddev->thread);
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out_release;
}
+ if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
+ ret = STRIPE_RETRY;
+ goto out;
+ }
+
if (stripe_can_batch(sh)) {
stripe_add_to_batch_list(conf, sh, ctx->batch_last);
if (ctx->batch_last)
@@ -6055,6 +6061,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ bool on_wq;
struct r5conf *conf = mddev->private;
sector_t logical_sector;
struct stripe_request_ctx ctx = {};
@@ -6079,8 +6086,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
}
- if (!md_write_start(mddev, bi))
- return false;
+ md_write_start(mddev, bi);
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
@@ -6113,9 +6119,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
- (conf->reshape_progress != MaxSector) &&
- !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
- ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
+ get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) {
bio_wouldblock_error(bi);
if (rw == WRITE)
md_write_end(mddev);
@@ -6130,11 +6134,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* sequential IO pattern. We don't bother with the optimization when
* reshaping as the performance benefit is not worth the complexity.
*/
- if (likely(conf->reshape_progress == MaxSector))
+ if (likely(conf->reshape_progress == MaxSector)) {
logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
+ on_wq = false;
+ } else {
+ add_wait_queue(&conf->wait_for_reshape, &wait);
+ on_wq = true;
+ }
s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
- add_wait_queue(&conf->wait_for_overlap, &wait);
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
@@ -6145,6 +6153,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
continue;
if (res == STRIPE_SCHEDULE_AND_RETRY) {
+ WARN_ON_ONCE(!on_wq);
/*
* Must release the reference to batch_last before
* scheduling and waiting for work to be done,
@@ -6169,7 +6178,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = ctx.first_sector +
(s << RAID5_STRIPE_SHIFT(conf));
}
- remove_wait_queue(&conf->wait_for_overlap, &wait);
+ if (unlikely(on_wq))
+ remove_wait_queue(&conf->wait_for_reshape, &wait);
if (ctx.batch_last)
raid5_release_stripe(ctx.batch_last);
@@ -6256,7 +6266,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
- BUG_ON(writepos < reshape_sectors);
+ if (WARN_ON(writepos < reshape_sectors))
+ return MaxSector;
+
writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
@@ -6274,14 +6286,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* to set 'stripe_addr' which is where we will write to.
*/
if (mddev->reshape_backwards) {
- BUG_ON(conf->reshape_progress == 0);
+ if (WARN_ON(conf->reshape_progress == 0))
+ return MaxSector;
+
stripe_addr = writepos;
- BUG_ON((mddev->dev_sectors &
- ~((sector_t)reshape_sectors - 1))
- - reshape_sectors - stripe_addr
- != sector_nr);
+ if (WARN_ON((mddev->dev_sectors &
+ ~((sector_t)reshape_sectors - 1)) -
+ reshape_sectors - stripe_addr != sector_nr))
+ return MaxSector;
} else {
- BUG_ON(writepos != sector_nr + reshape_sectors);
+ if (WARN_ON(writepos != sector_nr + reshape_sectors))
+ return MaxSector;
+
stripe_addr = sector_nr;
}
@@ -6316,7 +6332,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
: (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
- wait_event(conf->wait_for_overlap,
+ wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes)==0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6342,7 +6358,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
@@ -6425,7 +6441,7 @@ finish:
(sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
- wait_event(conf->wait_for_overlap,
+ wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes) == 0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6451,7 +6467,7 @@ finish:
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
@@ -6459,13 +6475,12 @@ ret:
}
static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
- int *skipped)
+ sector_t max_sector, int *skipped)
{
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
- sector_t max_sector = mddev->dev_sectors;
sector_t sync_blocks;
- int still_degraded = 0;
+ bool still_degraded = false;
int i;
if (sector_nr >= max_sector) {
@@ -6477,17 +6492,17 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
- md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
- &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- md_bitmap_close_sync(mddev->bitmap);
+ mddev->bitmap_ops->close_sync(mddev);
return 0;
}
/* Allow raid5_quiesce to complete */
- wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+ wait_event(conf->wait_for_reshape, conf->quiesce != 2);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -6510,7 +6525,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
- !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+ true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
@@ -6519,7 +6535,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
- md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
@@ -6538,10 +6554,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
- still_degraded = 1;
+ still_degraded = true;
}
- md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
+ mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+ still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -6734,6 +6751,9 @@ static void raid5d(struct md_thread *thread)
int batch_size, released;
unsigned int offset;
+ if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+ break;
+
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
clear_bit(R5_DID_ALLOC, &conf->cache_state);
@@ -6743,7 +6763,7 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
- md_bitmap_unplug(mddev->bitmap);
+ mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
@@ -6770,18 +6790,7 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
-
- /*
- * Waiting on MD_SB_CHANGE_PENDING below may deadlock
- * seeing md_check_recovery() is needed to clear
- * the flag when using mdmon.
- */
- continue;
}
-
- wait_event_lock_irq(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
- conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -7091,12 +7100,14 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
err = -ENODEV;
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->gendisk->queue;
+ struct queue_limits lim = queue_limits_start_update(q);
conf->skip_copy = new;
if (new)
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
+ lim.features |= BLK_FEAT_STABLE_WRITES;
else
- blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
+ lim.features &= ~BLK_FEAT_STABLE_WRITES;
+ err = queue_limits_commit_update(q, &lim);
}
mddev_unlock_and_resume(mddev);
return err ?: len;
@@ -7156,6 +7167,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
err = mddev_suspend_and_lock(mddev);
if (err)
return err;
+ raid5_quiesce(mddev, true);
+
conf = mddev->private;
if (!conf)
err = -ENODEV;
@@ -7177,6 +7190,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
kfree(old_groups);
}
}
+
+ raid5_quiesce(mddev, false);
mddev_unlock_and_resume(mddev);
return err ?: len;
@@ -7477,7 +7492,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
- init_waitqueue_head(&conf->wait_for_overlap);
+ init_waitqueue_head(&conf->wait_for_reshape);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
@@ -7571,11 +7586,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto abort;
- RCU_INIT_POINTER(disk->replacement, rdev);
+ disk->replacement = rdev;
} else {
if (disk->rdev)
goto abort;
- RCU_INIT_POINTER(disk->rdev, rdev);
+ disk->rdev = rdev;
}
if (test_bit(In_sync, &rdev->flags)) {
@@ -7711,13 +7726,13 @@ static int raid5_set_limits(struct mddev *mddev)
*/
stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
- blk_set_stacking_limits(&lim);
+ md_init_stacking_limits(&lim);
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
- lim.raid_partial_stripes_expensive = 1;
+ lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;
- mddev_stack_rdev_limits(mddev, &lim);
+ mddev_stack_rdev_limits(mddev, &lim, 0);
rdev_for_each(rdev, mddev)
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
mddev->gendisk->disk_name);
@@ -8057,7 +8072,7 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
seq_printf (seq, "]");
}
-static void print_raid5_conf (struct r5conf *conf)
+static void print_raid5_conf(struct r5conf *conf)
{
struct md_rdev *rdev;
int i;
@@ -8071,15 +8086,13 @@ static void print_raid5_conf (struct r5conf *conf)
conf->raid_disks,
conf->raid_disks - conf->mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- rdev = rcu_dereference(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
if (rdev)
pr_debug(" disk %d, o:%d, dev:%pg\n",
i, !test_bit(Faulty, &rdev->flags),
rdev->bdev);
}
- rcu_read_unlock();
}
static int raid5_spare_active(struct mddev *mddev)
@@ -8299,6 +8312,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
+ int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
@@ -8307,11 +8321,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- if (mddev->bitmap) {
- int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
- if (ret)
- return ret;
- }
+
+ ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
+ if (ret)
+ return ret;
+
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
@@ -8537,7 +8551,7 @@ static void end_reshape(struct r5conf *conf)
!test_bit(In_sync, &rdev->flags))
rdev->recovery_offset = MaxSector;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
mddev_update_io_opt(conf->mddev,
conf->raid_disks - conf->max_degraded);
@@ -8601,13 +8615,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
conf->quiesce = 1;
unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
} else {
/* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_quiescent);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
unlock_all_device_hash_locks_irq(conf);
}
log_quiesce(conf, quiesce);
@@ -8926,14 +8940,18 @@ static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
}
static struct md_personality raid6_personality =
{
- .name = "raid6",
- .level = 6,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID6,
+ .name = "raid6",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -8953,12 +8971,17 @@ static struct md_personality raid6_personality =
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid5_personality =
{
- .name = "raid5",
- .level = 5,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID5,
+ .name = "raid5",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -8978,13 +9001,18 @@ static struct md_personality raid5_personality =
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid4_personality =
{
- .name = "raid4",
- .level = 4,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID4,
+ .name = "raid4",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -9004,6 +9032,7 @@ static struct md_personality raid4_personality =
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static int __init raid5_init(void)
@@ -9019,21 +9048,39 @@ static int __init raid5_init(void)
"md/raid5:prepare",
raid456_cpu_up_prepare,
raid456_cpu_dead);
- if (ret) {
- destroy_workqueue(raid5_wq);
- return ret;
- }
- register_md_personality(&raid6_personality);
- register_md_personality(&raid5_personality);
- register_md_personality(&raid4_personality);
+ if (ret)
+ goto err_destroy_wq;
+
+ ret = register_md_submodule(&raid6_personality.head);
+ if (ret)
+ goto err_cpuhp_remove;
+
+ ret = register_md_submodule(&raid5_personality.head);
+ if (ret)
+ goto err_unregister_raid6;
+
+ ret = register_md_submodule(&raid4_personality.head);
+ if (ret)
+ goto err_unregister_raid5;
+
return 0;
+
+err_unregister_raid5:
+ unregister_md_submodule(&raid5_personality.head);
+err_unregister_raid6:
+ unregister_md_submodule(&raid6_personality.head);
+err_cpuhp_remove:
+ cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
+err_destroy_wq:
+ destroy_workqueue(raid5_wq);
+ return ret;
}
-static void raid5_exit(void)
+static void __exit raid5_exit(void)
{
- unregister_md_personality(&raid6_personality);
- unregister_md_personality(&raid5_personality);
- unregister_md_personality(&raid4_personality);
+ unregister_md_submodule(&raid6_personality.head); /* undo the three register_md_submodule() calls made in raid5_init() */
+ unregister_md_submodule(&raid5_personality.head);
+ unregister_md_submodule(&raid4_personality.head);
cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
destroy_workqueue(raid5_wq);
}