diff options
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 877 |
1 files changed, 502 insertions, 375 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 286f8b16c7bd..10ea3af40991 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -46,9 +46,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); -#define raid1_log(md, fmt, args...) \ - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) - #define RAID_1_10_NAME "raid1" #include "raid1-10.c" @@ -414,18 +411,18 @@ static void raid1_end_read_request(struct bio *bio) static void close_write(struct r1bio *r1_bio) { + struct mddev *mddev = r1_bio->mddev; + /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { bio_free_pages(r1_bio->behind_master_bio); bio_put(r1_bio->behind_master_bio); r1_bio->behind_master_bio = NULL; } - /* clear the bitmap if all writes complete successfully */ - md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); - md_write_end(r1_bio->mddev); + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->end_behind_write(mddev); + md_write_end(mddev); } static void r1_bio_write_done(struct r1bio *r1_bio) @@ -481,8 +478,6 @@ static void raid1_end_write_request(struct bio *bio) if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); else { - /* Fail the request */ - set_bit(R1BIO_Degraded, &r1_bio->state); /* Finished with this branch */ r1_bio->bios[mirror] = NULL; to_put = bio; @@ -498,9 +493,6 @@ static void raid1_end_write_request(struct bio *bio) * to user-side. So if something waits for IO, then it * will wait for the 'master' bio. */ - sector_t first_bad; - int bad_sectors; - r1_bio->bios[mirror] = NULL; to_put = bio; /* @@ -516,8 +508,8 @@ static void raid1_end_write_request(struct bio *bio) set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && !discard_error) { + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !discard_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } @@ -582,211 +574,319 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector, return len; } -/* - * This routine returns the disk from which the requested read should - * be done. There is a per-array 'next expected sequential IO' sector - * number - if this matches on the next IO then we use the last disk. - * There is also a per-disk 'last know head position' sector that is - * maintained from IRQ contexts, both the normal and the resync IO - * completion handlers update this position correctly. If there is no - * perfect sequential match then we pick the disk whose head is closest. - * - * If there are 2 mirrors in the same 2 devices, performance degrades - * because position is mirror, not device based. - * - * The rdev for the device selected will have nr_pending incremented. - */ -static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) +static void update_read_sectors(struct r1conf *conf, int disk, + sector_t this_sector, int len) { - const sector_t this_sector = r1_bio->sector; - int sectors; - int best_good_sectors; - int best_disk, best_dist_disk, best_pending_disk; - int has_nonrot_disk; + struct raid1_info *info = &conf->mirrors[disk]; + + atomic_inc(&info->rdev->nr_pending); + if (info->next_seq_sect != this_sector) + info->seq_start = this_sector; + info->next_seq_sect = this_sector + len; +} + +static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int len = r1_bio->sectors; int disk; - sector_t best_dist; - unsigned int min_pending; - struct md_rdev *rdev; - int choose_first; - int choose_next_idle; - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. - */ - retry: - sectors = r1_bio->sectors; - best_disk = -1; - best_dist_disk = -1; - best_dist = MaxSector; - best_pending_disk = -1; - min_pending = UINT_MAX; - best_good_sectors = 0; - has_nonrot_disk = 0; - choose_next_idle = 0; - clear_bit(R1BIO_FailFast, &r1_bio->state); + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int read_len; - if ((conf->mddev->recovery_cp < this_sector + sectors) || - (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, - this_sector + sectors))) - choose_first = 1; - else - choose_first = 0; + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + + /* choose the first disk even if it has some bad blocks. */ + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > 0) { + update_read_sectors(conf, disk, this_sector, read_len); + *max_sectors = read_len; + return disk; + } + } + + return -1; +} + +static bool rdev_in_recovery(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + return !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < r1_bio->sector + r1_bio->sectors; +} + +static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int best_disk = -1; + int best_len = 0; + int disk; for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { - sector_t dist; - sector_t first_bad; - int bad_sectors; - unsigned int pending; - bool nonrot; + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; rdev = conf->mirrors[disk].rdev; - if (r1_bio->bios[disk] == IO_BLOCKED - || rdev == NULL - || test_bit(Faulty, &rdev->flags)) + if (!rdev || test_bit(Faulty, &rdev->flags) || + rdev_in_recovery(rdev, r1_bio) || + test_bit(WriteMostly, &rdev->flags)) continue; - if (!test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < this_sector + sectors) + + /* keep track of the disk with the most readable sectors. */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > best_len) { + best_disk = disk; + best_len = read_len; + } + } + + if (best_disk != -1) { + *max_sectors = best_len; + update_read_sectors(conf, best_disk, this_sector, best_len); + } + + return best_disk; +} + +static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int bb_disk = -1; + int bb_read_len = 0; + int disk; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) continue; - if (test_bit(WriteMostly, &rdev->flags)) { - /* Don't balance among write-mostly, just - * use the first as a last resort */ - if (best_dist_disk < 0) { - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (first_bad <= this_sector) - /* Cannot use this */ - continue; - best_good_sectors = first_bad - this_sector; - } else - best_good_sectors = sectors; - best_dist_disk = disk; - best_pending_disk = disk; - } + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags) || + !test_bit(WriteMostly, &rdev->flags) || + rdev_in_recovery(rdev, r1_bio)) continue; + + /* there are no bad blocks, we can use this disk */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len == r1_bio->sectors) { + *max_sectors = read_len; + update_read_sectors(conf, disk, this_sector, read_len); + return disk; } - /* This is a reasonable device to use. It might - * even be best. + + /* + * there are partial bad blocks, choose the rdev with largest + * read length. */ - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* already have a better device */ - continue; - if (first_bad <= this_sector) { - /* cannot read here. If this is the 'primary' - * device, then we must not read beyond - * bad_sectors from another device.. - */ - bad_sectors -= (this_sector - first_bad); - if (choose_first && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - - } else { - sector_t good_sectors = first_bad - this_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_disk = disk; - } - if (choose_first) - break; - } - continue; - } else { - if ((sectors > best_good_sectors) && (best_disk >= 0)) - best_disk = -1; - best_good_sectors = sectors; + if (read_len > bb_read_len) { + bb_disk = disk; + bb_read_len = read_len; } + } + + if (bb_disk != -1) { + *max_sectors = bb_read_len; + update_read_sectors(conf, bb_disk, this_sector, bb_read_len); + } + + return bb_disk; +} + +static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio) +{ + /* TODO: address issues with this check and concurrency. */ + return conf->mirrors[disk].next_seq_sect == r1_bio->sector || + conf->mirrors[disk].head_position == r1_bio->sector; +} + +/* + * If buffered sequential IO size exceeds optimal iosize, check if there is idle + * disk. If yes, choose the idle disk. + */ +static bool should_choose_next(struct r1conf *conf, int disk) +{ + struct raid1_info *mirror = &conf->mirrors[disk]; + int opt_iosize; + + if (!test_bit(Nonrot, &mirror->rdev->flags)) + return false; + + opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9; + return opt_iosize > 0 && mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= mirror->seq_start; +} + +static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + if (!rdev || test_bit(Faulty, &rdev->flags)) + return false; + + if (rdev_in_recovery(rdev, r1_bio)) + return false; + + /* don't read from slow disk unless have to */ + if (test_bit(WriteMostly, &rdev->flags)) + return false; + + /* don't split IO for bad blocks unless have to */ + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors)) + return false; + + return true; +} + +struct read_balance_ctl { + sector_t closest_dist; + int closest_dist_disk; + int min_pending; + int min_pending_disk; + int sequential_disk; + int readable_disks; +}; + +static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio) +{ + int disk; + struct read_balance_ctl ctl = { + .closest_dist_disk = -1, + .closest_dist = MaxSector, + .min_pending_disk = -1, + .min_pending = UINT_MAX, + .sequential_disk = -1, + }; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + sector_t dist; + unsigned int pending; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; - if (best_disk >= 0) - /* At least two disks to choose from so failfast is OK */ + rdev = conf->mirrors[disk].rdev; + if (!rdev_readable(rdev, r1_bio)) + continue; + + /* At least two disks to choose from so failfast is OK */ + if (ctl.readable_disks++ == 1) set_bit(R1BIO_FailFast, &r1_bio->state); - nonrot = bdev_nonrot(rdev->bdev); - has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); - dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first) { - best_disk = disk; - break; - } + dist = abs(r1_bio->sector - conf->mirrors[disk].head_position); + /* Don't change to another disk for sequential reads */ - if (conf->mirrors[disk].next_seq_sect == this_sector - || dist == 0) { - int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; - struct raid1_info *mirror = &conf->mirrors[disk]; + if (is_sequential(conf, disk, r1_bio)) { + if (!should_choose_next(conf, disk)) + return disk; - best_disk = disk; /* - * If buffered sequential IO size exceeds optimal - * iosize, check if there is idle disk. If yes, choose - * the idle disk. read_balance could already choose an - * idle disk before noticing it's a sequential IO in - * this disk. This doesn't matter because this disk - * will idle, next time it will be utilized after the - * first disk has IO size exceeds optimal iosize. In - * this way, iosize of the first disk will be optimal - * iosize at least. iosize of the second disk might be - * small, but not a big deal since when the second disk - * starts IO, the first disk is likely still busy. + * Add 'pending' to avoid choosing this disk if + * there is other idle disk. */ - if (nonrot && opt_iosize > 0 && - mirror->seq_start != MaxSector && - mirror->next_seq_sect > opt_iosize && - mirror->next_seq_sect - opt_iosize >= - mirror->seq_start) { - choose_next_idle = 1; - continue; - } - break; + pending++; + /* + * If there is no other idle disk, this disk + * will be chosen. + */ + ctl.sequential_disk = disk; } - if (choose_next_idle) - continue; - - if (min_pending > pending) { - min_pending = pending; - best_pending_disk = disk; + if (ctl.min_pending > pending) { + ctl.min_pending = pending; + ctl.min_pending_disk = disk; } - if (dist < best_dist) { - best_dist = dist; - best_dist_disk = disk; + if (ctl.closest_dist > dist) { + ctl.closest_dist = dist; + ctl.closest_dist_disk = disk; } } /* + * sequential IO size exceeds optimal iosize, however, there is no other + * idle disk, so choose the sequential disk. + */ + if (ctl.sequential_disk != -1 && ctl.min_pending != 0) + return ctl.sequential_disk; + + /* * If all disks are rotational, choose the closest disk. If any disk is * non-rotational, choose the disk with less pending request even the * disk is rotational, which might/might not be optimal for raids with * mixed ratation/non-rotational disks depending on workload. */ - if (best_disk == -1) { - if (has_nonrot_disk || min_pending == 0) - best_disk = best_pending_disk; - else - best_disk = best_dist_disk; - } + if (ctl.min_pending_disk != -1 && + (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0)) + return ctl.min_pending_disk; + else + return ctl.closest_dist_disk; +} - if (best_disk >= 0) { - rdev = conf->mirrors[best_disk].rdev; - if (!rdev) - goto retry; - atomic_inc(&rdev->nr_pending); - sectors = best_good_sectors; +/* + * This routine returns the disk from which the requested read should be done. + * + * 1) If resync is in progress, find the first usable disk and use it even if it + * has some bad blocks. + * + * 2) Now that there is no resync, loop through all disks and skipping slow + * disks and disks with bad blocks for now. Only pay attention to key disk + * choice. + * + * 3) If we've made it this far, now look for disks with bad blocks and choose + * the one with most number of sectors. + * + * 4) If we are all the way at the end, we have no choice but to use a disk even + * if it is write mostly. + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + int disk; + + clear_bit(R1BIO_FailFast, &r1_bio->state); - if (conf->mirrors[best_disk].next_seq_sect != this_sector) - conf->mirrors[best_disk].seq_start = this_sector; + if (raid1_should_read_first(conf->mddev, r1_bio->sector, + r1_bio->sectors)) + return choose_first_rdev(conf, r1_bio, max_sectors); - conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; + disk = choose_best_rdev(conf, r1_bio); + if (disk >= 0) { + *max_sectors = r1_bio->sectors; + update_read_sectors(conf, disk, r1_bio->sector, + r1_bio->sectors); + return disk; } - *max_sectors = sectors; - return best_disk; + /* + * If we are here it means we didn't find a perfectly good disk so + * now spend a bit more time trying to find one with the most good + * sectors. + */ + disk = choose_bb_rdev(conf, r1_bio, max_sectors); + if (disk >= 0) + return disk; + + return choose_slow_rdev(conf, r1_bio, max_sectors); } static void wake_up_barrier(struct r1conf *conf) @@ -798,7 +898,7 @@ static void wake_up_barrier(struct r1conf *conf) static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - raid1_prepare_flush_writes(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev); wake_up_barrier(conf); while (bio) { /* submit pending writes */ @@ -1098,7 +1198,7 @@ static void freeze_array(struct r1conf *conf, int extra) */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; - raid1_log(conf->mddev, "wait freeze"); + mddev_add_trace_msg(conf->mddev, "raid1 wait freeze"); wait_event_lock_irq_cmd( conf->wait_barrier, get_unqueued_pending(conf) == extra, @@ -1215,13 +1315,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct raid1_info *mirror; struct bio *read_bio; - struct bitmap *bitmap = mddev->bitmap; const enum req_op op = bio_op(bio); const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; - int rdisk; + int rdisk, error; bool r1bio_existed = !!r1_bio; - char b[BDEVNAME_SIZE]; /* * If r1_bio is set, we are blocking the raid1d thread @@ -1230,16 +1328,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, */ gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO; - if (r1bio_existed) { - /* Need to get the block device name carefully */ - struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; - - if (rdev) - snprintf(b, sizeof(b), "%pg", rdev->bdev); - else - strcpy(b, "???"); - } - /* * Still need barrier for READ in case that whole * array is frozen. @@ -1261,15 +1349,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * used and no empty request is available. */ rdisk = read_balance(conf, r1_bio, &max_sectors); - if (rdisk < 0) { /* couldn't find anywhere to read from */ - if (r1bio_existed) { - pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", + if (r1bio_existed) + pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n", mdname(mddev), - b, - (unsigned long long)r1_bio->sector); - } + conf->mirrors[r1_bio->read_disk].rdev->bdev, + r1_bio->sector); raid_end_bio_io(r1_bio); return; } @@ -1281,20 +1367,23 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags) && - bitmap) { + if (test_bit(WriteMostly, &mirror->rdev->flags)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' */ - raid1_log(mddev, "wait behind writes"); - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); + mddev_add_trace_msg(mddev, "raid1 wait behind writes"); + mddev->bitmap_ops->wait_behind_writes(mddev); } if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); + + if (IS_ERR(split)) { + error = PTR_ERR(split); + goto err_handle; + } bio_chain(split, bio); submit_bio_noacct(bio); bio = split; @@ -1320,12 +1409,49 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_private = r1_bio; + mddev_trace_remap(mddev, read_bio, r1_bio->sector); + submit_bio_noacct(read_bio); + return; - if (mddev->gendisk) - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); +err_handle: + atomic_dec(&mirror->rdev->nr_pending); + bio->bi_status = errno_to_blk_status(error); + set_bit(R1BIO_Uptodate, &r1_bio->state); + raid_end_bio_io(r1_bio); +} - submit_bio_noacct(read_bio); +static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio) +{ + struct r1conf *conf = mddev->private; + int disks = conf->raid_disks * 2; + int i; + +retry: + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (!rdev) + continue; + + /* don't write here until the bad block is acknowledged */ + if (test_bit(WriteErrorSeen, &rdev->flags) && + rdev_has_badblock(rdev, bio->bi_iter.bi_sector, + bio_sectors(bio)) < 0) + set_bit(BlockedBadBlocks, &rdev->flags); + + if (rdev_blocked(rdev)) { + if (bio->bi_opf & REQ_NOWAIT) + return false; + + mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked", + rdev->raid_disk); + atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, rdev->mddev); + goto retry; + } + } + + return true; } static void raid1_write_request(struct mddev *mddev, struct bio *bio, @@ -1333,10 +1459,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks; - struct bitmap *bitmap = mddev->bitmap; + int i, disks, k, error; unsigned long flags; - struct md_rdev *blocked_rdev; int first_clone; int max_sectors; bool write_behind = false; @@ -1374,7 +1498,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, return; } - retry_write: + if (!wait_blocked_rdev(mddev, bio)) { + bio_wouldblock_error(bio); + return; + } + r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; @@ -1390,7 +1518,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ disks = conf->raid_disks * 2; - blocked_rdev = NULL; max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { struct md_rdev *rdev = conf->mirrors[i].rdev; @@ -1403,17 +1530,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags)) write_behind = true; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { - if (i < conf->raid_disks) - set_bit(R1BIO_Degraded, &r1_bio->state); + if (!rdev || test_bit(Faulty, &rdev->flags)) continue; - } atomic_inc(&rdev->nr_pending); if (test_bit(WriteErrorSeen, &rdev->flags)) { @@ -1423,13 +1542,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, &first_bad, &bad_sectors); - if (is_bad < 0) { - /* mustn't write here until the bad block is - * acknowledged*/ - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } if (is_bad && first_bad <= r1_bio->sector) { /* Cannot write here at all */ bad_sectors -= (r1_bio->sector - first_bad); @@ -1439,20 +1551,24 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ max_sectors = bad_sectors; rdev_dec_pending(rdev, mddev); - /* We don't set R1BIO_Degraded as that - * only applies if the disk is - * missing, so it might be re-added, - * and we want to know to recover this - * chunk. - * In this case the device is here, - * and the fact that this chunk is not - * in-sync is recorded in the bad - * block log - */ continue; } if (is_bad) { - int good_sectors = first_bad - r1_bio->sector; + int good_sectors; + + /* + * We cannot atomically write this, so just + * error in that case. It could be possible to + * atomically write other mirrors, but the + * complexity of supporting that is not worth + * the benefit. + */ + if (bio->bi_opf & REQ_ATOMIC) { + error = -EIO; + goto err_handle; + } + + good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) max_sectors = good_sectors; } @@ -1460,38 +1576,23 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->bios[i] = bio; } - if (unlikely(blocked_rdev)) { - /* Wait for this device to become unblocked */ - int j; - - for (j = 0; j < i; j++) - if (r1_bio->bios[j]) - rdev_dec_pending(conf->mirrors[j].rdev, mddev); - free_r1bio(r1_bio); - allow_barrier(conf, bio->bi_iter.bi_sector); - - if (bio->bi_opf & REQ_NOWAIT) { - bio_wouldblock_error(bio); - return; - } - raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf, bio->bi_iter.bi_sector, false); - goto retry_write; - } - /* * When using a bitmap, we may call alloc_behind_master_bio below. * alloc_behind_master_bio allocates a copy of the data payload a page * at a time and thus needs a new bio that can fit the whole payload * this bio in page sized chunks. */ - if (write_behind && bitmap) + if (write_behind && mddev->bitmap) max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, &conf->bio_split); + + if (IS_ERR(split)) { + error = PTR_ERR(split); + goto err_handle; + } bio_chain(split, bio); submit_bio_noacct(bio); bio = split; @@ -1513,19 +1614,22 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { + unsigned long max_write_behind = + mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + /* do behind I/O ? * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - if (bitmap && write_behind && - (atomic_read(&bitmap->behind_writes) - < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) { + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && write_behind && !stats.behind_wait && + stats.behind_writes < max_write_behind) alloc_behind_master_bio(r1_bio, bio); - } - md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->start_behind_write(mddev); first_clone = 0; } @@ -1549,7 +1653,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); mbio->bi_end_io = raid1_end_write_request; - mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); + mbio->bi_opf = bio_op(bio) | + (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC)); if (test_bit(FailFast, &rdev->flags) && !test_bit(WriteMostly, &rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -1557,10 +1662,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); - - if (mddev->gendisk) - trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), - r1_bio->sector); + mddev_trace_remap(mddev, mbio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { @@ -1575,6 +1677,18 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, /* In case raid1d snuck in to freeze_array */ wake_up_barrier(conf); + return; +err_handle: + for (k = 0; k < i; k++) { + if (r1_bio->bios[k]) { + rdev_dec_pending(conf->mirrors[k].rdev, mddev); + r1_bio->bios[k] = NULL; + } + } + + bio->bi_status = errno_to_blk_status(error); + set_bit(R1BIO_Uptodate, &r1_bio->state); + raid_end_bio_io(r1_bio); } static bool raid1_make_request(struct mddev *mddev, struct bio *bio) @@ -1598,8 +1712,7 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == READ) raid1_read_request(mddev, bio, sectors, NULL); else { - if (!md_write_start(mddev,bio)) - return false; + md_write_start(mddev,bio); raid1_write_request(mddev, bio, sectors); } return true; @@ -1760,6 +1873,52 @@ static int raid1_spare_active(struct mddev *mddev) return count; } +static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, + bool replacement) +{ + struct raid1_info *info = conf->mirrors + disk; + + if (replacement) + info += conf->raid_disks; + + if (info->rdev) + return false; + + if (bdev_nonrot(rdev->bdev)) { + set_bit(Nonrot, &rdev->flags); + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); + } + + rdev->raid_disk = disk; + info->head_position = 0; + info->seq_start = MaxSector; + WRITE_ONCE(info->rdev, rdev); + + return true; +} + +static bool raid1_remove_conf(struct r1conf *conf, int disk) +{ + struct raid1_info *info = conf->mirrors + disk; + struct md_rdev *rdev = info->rdev; + + if (!rdev || test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) + return false; + + /* Only remove non-faulty devices if recovery is not possible. */ + if (!test_bit(Faulty, &rdev->flags) && + rdev->mddev->recovery_disabled != conf->recovery_disabled && + rdev->mddev->degraded < conf->raid_disks) + return false; + + if (test_and_clear_bit(Nonrot, &rdev->flags)) + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1); + + WRITE_ONCE(info->rdev, NULL); + return true; +} + static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; @@ -1772,9 +1931,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1791,19 +1947,16 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors + mirror; if (!p->rdev) { - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; - p->head_position = 0; - rdev->raid_disk = mirror; - err = 0; + raid1_add_conf(conf, rdev, mirror, false); /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ if (rdev->saved_raid_disk < 0) conf->fullsync = 1; - WRITE_ONCE(p->rdev, rdev); break; } if (test_bit(WantReplacement, &p->rdev->flags) && @@ -1813,13 +1966,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (err && repl_slot >= 0) { /* Add this device as a replacement */ - p = conf->mirrors + repl_slot; clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); - rdev->raid_disk = repl_slot; + raid1_add_conf(conf, rdev, repl_slot, true); err = 0; conf->fullsync = 1; - WRITE_ONCE(p[conf->raid_disks].rdev, rdev); } print_conf(conf); @@ -1836,27 +1987,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (unlikely(number >= conf->raid_disks)) goto abort; - if (rdev != p->rdev) - p = conf->mirrors + conf->raid_disks + number; + if (rdev != p->rdev) { + number += conf->raid_disks; + p = conf->mirrors + number; + } print_conf(conf); if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { + if (!raid1_remove_conf(conf, number)) { err = -EBUSY; goto abort; } - /* Only remove non-faulty devices if recovery - * is not possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && - mddev->degraded < conf->raid_disks) { - err = -EBUSY; - goto abort; - } - WRITE_ONCE(p->rdev, NULL); - if (conf->mirrors[conf->raid_disks + number].rdev) { + + if (number < conf->raid_disks && + conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. @@ -1916,7 +2060,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -1944,8 +2088,6 @@ static void end_sync_write(struct bio *bio) struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; - sector_t first_bad; - int bad_sectors; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { @@ -1955,14 +2097,11 @@ static void end_sync_write(struct bio *bio) set_bit(MD_RECOVERY_NEEDED, & mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && - !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) - ) + } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, r1_bio->sectors)) { set_bit(R1BIO_MadeGood, &r1_bio->state); + } put_sync_write_buf(r1_bio, uptodate); } @@ -2279,16 +2418,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) s = PAGE_SIZE >> 9; do { - sector_t first_bad; - int bad_sectors; - rdev = conf->mirrors[d].rdev; if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sect + s)) && - is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0) { + rdev_has_badblock(rdev, sect, s) == 0) { atomic_inc(&rdev->nr_pending); if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, false)) @@ -2461,12 +2596,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * errors. */ fail = true; - if (!narrow_write_error(r1_bio, m)) { + if (!narrow_write_error(r1_bio, m)) md_error(conf->mddev, conf->mirrors[m].rdev); /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } @@ -2557,8 +2690,6 @@ static void raid1d(struct md_thread *thread) list_del(&r1_bio->retry_list); idx = sector_to_idx(r1_bio->sector); atomic_dec(&conf->nr_queued[idx]); - if (mddev->degraded) - set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) close_write(r1_bio); raid_end_bio_io(r1_bio); @@ -2643,18 +2774,18 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) */ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; struct bio *bio; - sector_t max_sector, nr_sectors; + sector_t nr_sectors; int disk = -1; int i; int wonly = -1; int write_targets = 0, read_targets = 0; sector_t sync_blocks; - int still_degraded = 0; + bool still_degraded = false; int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ int idx = sector_to_idx(sector_nr); @@ -2664,7 +2795,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; - max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the * sync on the 'current' bitmap chunk (there will @@ -2672,12 +2802,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2697,7 +2827,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2715,9 +2845,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * sector_nr + two times RESYNC_SECTORS */ - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, - mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -2748,7 +2878,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) - still_degraded = 1; + still_degraded = true; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_opf = REQ_OP_WRITE; bio->bi_end_io = end_sync_write; @@ -2872,8 +3002,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, still_degraded) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3006,23 +3136,17 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); + conf->raid_disks = mddev->raid_disks; rdev_for_each(rdev, mddev) { int disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks - || disk_idx < 0) + + if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; - if (test_bit(Replacement, &rdev->flags)) - disk = conf->mirrors + mddev->raid_disks + disk_idx; - else - disk = conf->mirrors + disk_idx; - if (disk->rdev) + if (!raid1_add_conf(conf, rdev, disk_idx, + test_bit(Replacement, &rdev->flags))) goto abort; - disk->rdev = rdev; - disk->head_position = 0; - disk->seq_start = MaxSector; } - conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list); @@ -3086,12 +3210,24 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static void raid1_free(struct mddev *mddev, void *priv); +static int raid1_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + int err; + + md_init_stacking_limits(&lim); + lim.max_write_zeroes_sectors = 0; + lim.features |= BLK_FEAT_ATOMIC_WRITES; + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) + return err; + return queue_limits_set(mddev->gendisk->queue, &lim); +} + static int raid1_run(struct mddev *mddev) { struct r1conf *conf; int i; - struct md_rdev *rdev; int ret; if (mddev->level != 1) { @@ -3118,14 +3254,10 @@ static int raid1_run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - if (mddev->queue) - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - - rdev_for_each(rdev, mddev) { - if (!mddev->gendisk) - continue; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (!mddev_is_dm(mddev)) { + ret = raid1_set_limits(mddev); + if (ret) + return ret; } mddev->degraded = 0; @@ -3139,8 +3271,7 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); - ret = -EINVAL; - goto abort; + return -EINVAL; } if (conf->raid_disks - mddev->degraded == 1) @@ -3164,14 +3295,8 @@ static int raid1_run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); ret = md_integrity_register(mddev); - if (ret) { + if (ret) md_unregister_thread(mddev, &mddev->thread); - goto abort; - } - return 0; - -abort: - raid1_free(mddev, conf); return ret; } @@ -3201,14 +3326,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); + int ret; + if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); - if (ret) - return ret; - } + + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { |