From 820455238366a78a44a85cc7d58a987e728464d9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:07 +0800 Subject: md/raid10: switch to use md_account_bio() for io accounting Make sure that 'active_io' will represent inflight io instead of io that is dispatching, and io accounting from all levels will be consistent. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-6-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5051149e27bb..d42e9b7d2608 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (r10_bio->start_time) - bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, } static void raid10_read_request(struct mddev *mddev, struct bio *bio, - struct r10bio *r10_bio) + struct r10bio *r10_bio, bool io_accounting) { struct r10conf *conf = mddev->private; struct bio *read_bio; @@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - if (!r10_bio->start_time && - blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + if (io_accounting) { + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; + } read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; @@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); @@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; - r10_bio->start_time = 0; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) - raid10_read_request(mddev, bio, r10_bio); + raid10_read_request(mddev, bio, r10_bio, true); else raid10_write_request(mddev, bio, r10_bio); } @@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) rdev_dec_pending(rdev, mddev); r10_bio->state = 0; - raid10_read_request(mddev, r10_bio->master_bio, r10_bio); + raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false); /* * allow_barrier after re-submit to ensure no sync io * can be issued while regular io pending. -- cgit From 605eeda6e70f692311b36180f217208d367476f6 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 24 Jun 2023 01:32:34 +0800 Subject: md/raid10: optimize fix_read_error We dereference r10_bio->read_slot too many times in fix_read_error(). Optimize it by using a variable to store read_slot. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230623173236.2513554-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d42e9b7d2608..abea91a54db1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2718,10 +2718,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) { int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; + int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); - int d = r10_bio->devs[r10_bio->read_slot].devnum; + int d = r10_bio->devs[slot].devnum; /* still own a reference to this rdev, so it cannot * have been cleared recently. @@ -2742,13 +2742,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 pr_notice("md/raid10:%s: %pg: Failing raid device\n", mdname(mddev), rdev->bdev); md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + r10_bio->devs[slot].bio = IO_BLOCKED; return; } while(sectors) { int s = sectors; - int sl = r10_bio->read_slot; + int sl = slot; int success = 0; int start; @@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != r10_bio->read_slot); + } while (!success && sl != slot); rcu_read_unlock(); if (!success) { @@ -2791,16 +2791,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 * as bad on the first device to discourage future * reads. */ - int dn = r10_bio->devs[r10_bio->read_slot].devnum; + int dn = r10_bio->devs[slot].devnum; rdev = conf->mirrors[dn].rdev; if (!rdev_set_badblocks( rdev, - r10_bio->devs[r10_bio->read_slot].addr + r10_bio->devs[slot].addr + sect, s, 0)) { md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio + r10_bio->devs[slot].bio = IO_BLOCKED; } break; @@ -2809,7 +2809,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 start = sl; /* write it back and re-read */ rcu_read_lock(); - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; @@ -2843,7 +2843,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rcu_read_lock(); } sl = start; - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; -- cgit From 02c67a3b72b13951c2ca134bd7065f03ec57946d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 24 Jun 2023 01:32:35 +0800 Subject: md: remove redundant check in fix_read_error() In fix_read_error(), 'success' will be checked immediately after assigning it, if it is set to 1 then the loop will break. Checking it again in condition of loop is redundant. Clean it up. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230623173236.2513554-3-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index abea91a54db1..757687fb90a7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != slot); + } while (sl != slot); rcu_read_unlock(); if (!success) { -- cgit From 7e85c41b9e1df9192c225afd7cfec8dcad137feb Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:27 +0800 Subject: md/raid10: check replacement and rdev to prevent submit the same io twice After commit 4ca40c2ce099 ("md/raid10: Allow replacement device to be replace old drive."), 'rdev' and 'replacement' could appear to be identical. There are already checks for that in wait_blocked_dev() and raid10_write_request(). Add check for raid10_handle_discard() now. Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 757687fb90a7..60963449d3f5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1785,6 +1785,8 @@ retry_discard: r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; + if (rdev == rrdev) + rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) -- cgit From b99f8fd2d91eb734f13098aa1cf337edaca454b7 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:28 +0800 Subject: md/raid10: factor out dereference_rdev_and_rrdev() Factor out a helper to get 'rdev' and 'replacement' from config->mirrors. Just to make code cleaner and prepare to fix the bug of io loss while 'replacement' replace 'rdev'. There is no functional change. Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-3-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 60963449d3f5..d21aeb9546d9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1321,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, + struct md_rdev **prrdev) +{ + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(mirror->replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(mirror->rdev); + if (rdev == rrdev) + rrdev = NULL; + + *prrdev = rrdev; + return rdev; +} + static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { int i; @@ -1464,15 +1483,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int d = r10_bio->devs[i].devnum; struct md_rdev *rdev, *rrdev; - rrdev = rcu_dereference(conf->mirrors[d].replacement); - /* - * Read replacement first to prevent reading both rdev and - * replacement as NULL during replacement replace rdev. - */ - smp_mb(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev == rrdev) - rrdev = NULL; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) -- cgit From 673643490b9a0eb3b25633abe604f62b8f63dba1 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:29 +0800 Subject: md/raid10: use dereference_rdev_and_rrdev() to get devices Commit 2ae6aaf76912 ("md/raid10: fix io loss while replacement replace rdev") reads replacement first to prevent io loss. However, there are same issue in wait_blocked_dev() and raid10_handle_discard(), too. Fix it by using dereference_rdev_and_rrdev() to get devices. Fixes: d30588b2731f ("md/raid10: improve raid10 discard request") Fixes: f2e7e269a752 ("md/raid10: pull the code that wait for blocked dev into one function") Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-4-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d21aeb9546d9..16aa9d735880 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1350,11 +1350,9 @@ retry_wait: blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[i].replacement); - if (rdev == rrdev) - rrdev = NULL; + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1789,15 +1787,12 @@ retry_discard: */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); + struct md_rdev *rdev, *rrdev; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; - if (rdev == rrdev) - rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) -- cgit From 892da88d1cd93426e9c6d7717876ca705fe2b9fa Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 31 Jul 2023 10:28:00 +0800 Subject: md/raid10: fix a 'conf->barrier' leakage in raid10_takeover() After commit b39f35ebe86d ("md: don't quiesce in mddev_suspend()"), 'conf->barrier' will be leaked in the case that raid10 takeover raid0: level_store pers->takeover -> raid10_takeover raid10_takeover_raid0 WRITE_ONCE(conf->barrier, 1) mddev_suspend // still raid0 mddev->pers = pers // switch to raid10 mddev_resume // resume without suspend After the above commit, mddev_resume() will not decrease 'conf->barrier' that is set in raid10_takeover_raid0(). Fix this problem by not setting 'conf->barrier' in raid10_takeover_raid0(). By the way, this problem is found while I'm trying to make mddev_suspend/resume() to be independent from raid personalities. raid10 is the only personality to use reference count in the quiesce() callback and this problem is only related to raid10. Fixes: b39f35ebe86d ("md: don't quiesce in mddev_suspend()") Signed-off-by: Yu Kuai Reviewed-by: Paul Menzel Link: https://lore.kernel.org/r/20230731022800.1424902-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 16aa9d735880..7704a4c7f469 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4417,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) rdev->new_raid_disk = rdev->raid_disk * 2; rdev->sectors = size; } - WRITE_ONCE(conf->barrier, 1); } return conf; -- cgit From 7eb8ff02c1df279bf7f7f29b866beb655a9eebe9 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 3 Aug 2023 15:17:11 +0800 Subject: md: Hold mddev->reconfig_mutex when trying to get mddev->sync_thread Commit ba9d9f1a707f ("Revert "md: unlock mddev before reap sync_thread in action_store"") removed the scenario of calling md_unregister_thread() without holding mddev->reconfig_mutex, so add a lock holding check before acquiring mddev->sync_thread by passing mdev to md_unregister_thread(). Signed-off-by: Li Lingfeng Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230803071711.2546560-1-lilingfeng@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7704a4c7f469..023413120851 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4320,7 +4320,7 @@ static int raid10_run(struct mddev *mddev) return 0; out_free_conf: - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); raid10_free_conf(conf); mddev->private = NULL; out: -- cgit