From 483cbbeddd5fe2c80fd4141ff0748fa06c4ff146 Mon Sep 17 00:00:00 2001 From: Alexei Naberezhnov Date: Tue, 27 Mar 2018 16:54:16 -0700 Subject: md/raid5: fix 'out of memory' during raid cache recovery This fixes the case when md array assembly fails because of raid cache recovery unable to allocate a stripe, despite attempts to replay stripes and increase cache size. This happens because stripes released by r5c_recovery_replay_stripes and raid5_set_cache_size don't become available for allocation immediately. Released stripes first are placed on conf->released_stripes list and require md thread to merge them on conf->inactive_list before they can be allocated. Patch allows final allocation attempt during cache recovery to wait for new stripes to become availabe for allocation. Cc: linux-raid@vger.kernel.org Cc: Shaohua Li Cc: linux-stable # 4.10+ Fixes: b4c625c67362 ("md/r5cache: r5cache recovery: part 1") Signed-off-by: Alexei Naberezhnov Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 33 ++++++++++++++++++++++----------- drivers/md/raid5.c | 8 ++++++-- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index ec3a5ef7fee0..cbbe6b6535be 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1935,12 +1935,14 @@ out: } static struct stripe_head * -r5c_recovery_alloc_stripe(struct r5conf *conf, - sector_t stripe_sect) +r5c_recovery_alloc_stripe( + struct r5conf *conf, + sector_t stripe_sect, + int noblock) { struct stripe_head *sh; - sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); + sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0); if (!sh) return NULL; /* no more stripe available */ @@ -2150,7 +2152,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, stripe_sect); if (!sh) { - sh = r5c_recovery_alloc_stripe(conf, stripe_sect); + sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1); /* * cannot get stripe from raid5_get_active_stripe * try replay some stripes @@ -2159,20 +2161,29 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, r5c_recovery_replay_stripes( cached_stripe_list, ctx); sh = r5c_recovery_alloc_stripe( - conf, stripe_sect); + conf, stripe_sect, 1); } if (!sh) { + int new_size = conf->min_nr_stripes * 2; pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", mdname(mddev), - conf->min_nr_stripes * 2); - raid5_set_cache_size(mddev, - conf->min_nr_stripes * 2); - sh = r5c_recovery_alloc_stripe(conf, - stripe_sect); + new_size); + ret = raid5_set_cache_size(mddev, new_size); + if (conf->min_nr_stripes <= new_size / 2) { + pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n", + mdname(mddev), + ret, + new_size, + conf->min_nr_stripes, + conf->max_nr_stripes); + return -ENOMEM; + } + sh = r5c_recovery_alloc_stripe( + conf, stripe_sect, 0); } if (!sh) { pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", - mdname(mddev)); + mdname(mddev)); return -ENOMEM; } list_add_tail(&sh->lru, cached_stripe_list); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4990f0319f6c..cecea901ab8c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6369,6 +6369,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page) int raid5_set_cache_size(struct mddev *mddev, int size) { + int result = 0; struct r5conf *conf = mddev->private; if (size <= 16 || size > 32768) @@ -6385,11 +6386,14 @@ raid5_set_cache_size(struct mddev *mddev, int size) mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) - if (!grow_one_stripe(conf, GFP_KERNEL)) + if (!grow_one_stripe(conf, GFP_KERNEL)) { + conf->min_nr_stripes = conf->max_nr_stripes; + result = -ENOMEM; break; + } mutex_unlock(&conf->cache_size_mutex); - return 0; + return result; } EXPORT_SYMBOL(raid5_set_cache_size); -- cgit From 645efa84f6c7566ea863ed37a8b3247247f72e02 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 5 Feb 2019 05:09:00 -0500 Subject: dm: add memory barrier before waitqueue_active Block core changes to switch bio-based IO accounting to be percpu had a side-effect of altering DM core to now rely on calling waitqueue_active (in both bio-based and request-based) to check if another task is in dm_wait_for_completion(). A memory barrier is needed before calling waitqueue_active(). DM core doesn't piggyback on a preceding memory barrier so it must explicitly use its own. For more details on why using waitqueue_active() without a preceding barrier is unsafe, please see the comment before the waitqueue_active() definition in include/linux/wait.h. Add the missing memory barrier by switching to using wq_has_sleeper(). Fixes: 6f75723190d8 ("dm: remove the pending IO accounting") Fixes: c4576aed8d85 ("dm: fix request-based dm's use of dm_wait_for_completion") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 2 +- drivers/md/dm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 4eb5f8c56535..a20531e5f3b4 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -131,7 +131,7 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) static void rq_completed(struct mapped_device *md) { /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) + if (unlikely(wq_has_sleeper(&md->wait))) wake_up(&md->wait); /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2b53c3841b53..a0972a9301de 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -699,7 +699,7 @@ static void end_io_acct(struct dm_io *io) true, duration, &io->stats_aux); /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) + if (unlikely(wq_has_sleeper(&md->wait))) wake_up(&md->wait); } -- cgit From fa8db4948f5224dae33a0e783e7dec682e145f88 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 5 Feb 2019 17:07:58 -0500 Subject: dm: don't use bio_trim() afterall bio_trim() has an early return, which makes it _not_ idempotent, if the offset is 0 and the bio's bi_size already matches the requested size. Prior to DM, all users of bio_trim() were fine with this. But DM has exposed the fact that bio_trim()'s early return is incompatible with a cloned bio whose integrity payload must be trimmed via bio_integrity_trim(). Fix this by reverting DM back to doing the equivalent of bio_trim() but in an idempotent manner (so bio_integrity_trim is always performed). Follow-on work is needed to assess what benefit bio_trim()'s early return is providing to its existing callers. Reported-by: Milan Broz Fixes: 57c36519e4b94 ("dm: fix clone_bio() to trigger blk_recount_segments()") Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a0972a9301de..515e6af9bed2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1336,7 +1336,11 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, return r; } - bio_trim(clone, sector - clone->bi_iter.bi_sector, len); + bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); + clone->bi_iter.bi_size = to_bytes(len); + + if (bio_integrity(bio)) + bio_integrity_trim(clone); return 0; } -- cgit