From 0a005856d35936e351cc20ff3de04499bd7ffbfd Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 14:11:11 +0800 Subject: dm clone: Make __hash_find static drivers/md/dm-clone-target.c:594:34: warning: symbol '__hash_find' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Mike Snitzer --- drivers/md/dm-clone-target.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index cd6f9e9fc98e..4ca8f1977222 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -591,8 +591,8 @@ static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone, * * NOTE: Must be called with the bucket lock held */ -struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, - unsigned long region_nr) +static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, + unsigned long region_nr) { struct dm_clone_region_hydration *hd; -- cgit From a2f83e8b0c82c9500421a26c49eb198b25fcdea3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 2 Oct 2019 06:14:17 -0400 Subject: dm snapshot: introduce account_start_copy() and account_end_copy() This simple refactoring moves code for modifying the semaphore cow_count into separate functions to prepare for changes that will extend these methods to provide for a more sophisticated mechanism for COW throttling. Signed-off-by: Mikulas Patocka Reviewed-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f150f5c5492b..da3bd1794ee0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1512,6 +1512,16 @@ static void snapshot_dtr(struct dm_target *ti) kfree(s); } +static void account_start_copy(struct dm_snapshot *s) +{ + down(&s->cow_count); +} + +static void account_end_copy(struct dm_snapshot *s) +{ + up(&s->cow_count); +} + /* * Flush a list of buffers. */ @@ -1732,7 +1742,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) rb_link_node(&pe->out_of_order_node, parent, p); rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } - up(&s->cow_count); + account_end_copy(s); } /* @@ -1756,7 +1766,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - down(&s->cow_count); + account_start_copy(s); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } @@ -1776,7 +1786,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - down(&s->cow_count); + account_start_copy(s); callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); @@ -1866,7 +1876,7 @@ static void zero_callback(int read_err, unsigned long write_err, void *context) struct bio *bio = context; struct dm_snapshot *s = bio->bi_private; - up(&s->cow_count); + account_end_copy(s); bio->bi_status = write_err ? BLK_STS_IOERR : 0; bio_endio(bio); } @@ -1880,7 +1890,7 @@ static void zero_exception(struct dm_snapshot *s, struct dm_exception *e, dest.sector = bio->bi_iter.bi_sector; dest.count = s->store->chunk_size; - down(&s->cow_count); + account_start_copy(s); WARN_ON_ONCE(bio->bi_private); bio->bi_private = s; dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio); -- cgit From b21555786f18cd77f2311ad89074533109ae3ffa Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 2 Oct 2019 06:15:53 -0400 Subject: dm snapshot: rework COW throttling to fix deadlock Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and workqueue stalls") introduced a semaphore to limit the maximum number of in-flight kcopyd (COW) jobs. The implementation of this throttling mechanism is prone to a deadlock: 1. One or more threads write to the origin device causing COW, which is performed by kcopyd. 2. At some point some of these threads might reach the s->cow_count semaphore limit and block in down(&s->cow_count), holding a read lock on _origins_lock. 3. Someone tries to acquire a write lock on _origins_lock, e.g., snapshot_ctr(), which blocks because the threads at step (2) already hold a read lock on it. 4. A COW operation completes and kcopyd runs dm-snapshot's completion callback, which ends up calling pending_complete(). pending_complete() tries to resubmit any deferred origin bios. This requires acquiring a read lock on _origins_lock, which blocks. This happens because the read-write semaphore implementation gives priority to writers, meaning that as soon as a writer tries to enter the critical section, no readers will be allowed in, until all writers have completed their work. So, pending_complete() waits for the writer at step (3) to acquire and release the lock. This writer waits for the readers at step (2) to release the read lock and those readers wait for pending_complete() (the kcopyd thread) to signal the s->cow_count semaphore: DEADLOCK. The above was thoroughly analyzed and documented by Nikos Tsironis as part of his initial proposal for fixing this deadlock, see: https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html Fix this deadlock by reworking COW throttling so that it waits without holding any locks. Add a variable 'in_progress' that counts how many kcopyd jobs are running. A function wait_for_in_progress() will sleep if 'in_progress' is over the limit. It drops _origins_lock in order to avoid the deadlock. Reported-by: Guruswamy Basavaiah Reported-by: Nikos Tsironis Reviewed-by: Nikos Tsironis Tested-by: Nikos Tsironis Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls") Cc: stable@vger.kernel.org # v5.0+ Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index da3bd1794ee0..4fb1a40e68a0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "dm.h" @@ -107,8 +106,8 @@ struct dm_snapshot { /* The on disk metadata handler */ struct dm_exception_store *store; - /* Maximum number of in-flight COW jobs. */ - struct semaphore cow_count; + unsigned in_progress; + struct wait_queue_head in_progress_wait; struct dm_kcopyd_client *kcopyd_client; @@ -162,8 +161,8 @@ struct dm_snapshot { */ #define DEFAULT_COW_THRESHOLD 2048 -static int cow_threshold = DEFAULT_COW_THRESHOLD; -module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); +static unsigned cow_threshold = DEFAULT_COW_THRESHOLD; +module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644); MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, @@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_hash_tables; } - sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX); + init_waitqueue_head(&s->in_progress_wait); s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(s->kcopyd_client)) { @@ -1509,17 +1508,54 @@ static void snapshot_dtr(struct dm_target *ti) dm_put_device(ti, s->origin); + WARN_ON(s->in_progress); + kfree(s); } static void account_start_copy(struct dm_snapshot *s) { - down(&s->cow_count); + spin_lock(&s->in_progress_wait.lock); + s->in_progress++; + spin_unlock(&s->in_progress_wait.lock); } static void account_end_copy(struct dm_snapshot *s) { - up(&s->cow_count); + spin_lock(&s->in_progress_wait.lock); + BUG_ON(!s->in_progress); + s->in_progress--; + if (likely(s->in_progress <= cow_threshold) && + unlikely(waitqueue_active(&s->in_progress_wait))) + wake_up_locked(&s->in_progress_wait); + spin_unlock(&s->in_progress_wait.lock); +} + +static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins) +{ + if (unlikely(s->in_progress > cow_threshold)) { + spin_lock(&s->in_progress_wait.lock); + if (likely(s->in_progress > cow_threshold)) { + /* + * NOTE: this throttle doesn't account for whether + * the caller is servicing an IO that will trigger a COW + * so excess throttling may result for chunks not required + * to be COW'd. But if cow_threshold was reached, extra + * throttling is unlikely to negatively impact performance. + */ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&s->in_progress_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&s->in_progress_wait.lock); + if (unlock_origins) + up_read(&_origins_lock); + io_schedule(); + remove_wait_queue(&s->in_progress_wait, &wait); + return false; + } + spin_unlock(&s->in_progress_wait.lock); + } + return true; } /* @@ -1537,7 +1573,7 @@ static void flush_bios(struct bio *bio) } } -static int do_origin(struct dm_dev *origin, struct bio *bio); +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit); /* * Flush a list of buffers. @@ -1550,7 +1586,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - r = do_origin(s->origin, bio); + r = do_origin(s->origin, bio, false); if (r == DM_MAPIO_REMAPPED) generic_make_request(bio); bio = n; @@ -1926,6 +1962,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; + if (bio_data_dir(bio) == WRITE) { + while (unlikely(!wait_for_in_progress(s, false))) + ; /* wait_for_in_progress() has slept */ + } + down_read(&s->lock); dm_exception_table_lock(&lock); @@ -2122,7 +2163,7 @@ redirect_to_origin: if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); - return do_origin(s->origin, bio); + return do_origin(s->origin, bio, false); } out_unlock: @@ -2497,15 +2538,24 @@ next_snapshot: /* * Called on a write from the origin driver. */ -static int do_origin(struct dm_dev *origin, struct bio *bio) +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit) { struct origin *o; int r = DM_MAPIO_REMAPPED; +again: down_read(&_origins_lock); o = __lookup_origin(origin->bdev); - if (o) + if (o) { + if (limit) { + struct dm_snapshot *s; + list_for_each_entry(s, &o->snapshots, list) + if (unlikely(!wait_for_in_progress(s, true))) + goto again; + } + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); + } up_read(&_origins_lock); return r; @@ -2618,7 +2668,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) dm_accept_partial_bio(bio, available_sectors); /* Only tell snapshots if this is a write */ - return do_origin(o->dev, bio); + return do_origin(o->dev, bio, true); } /* -- cgit From 3874d73e06c9b9dc15de0b7382fc223986d75571 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 14 Oct 2019 16:58:35 -0700 Subject: md/raid0: fix warning message for parameter default_layout The message should match the parameter, i.e. raid0.default_layout. Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.") Cc: NeilBrown Reported-by: Ivan Topolsky Signed-off-by: Song Liu --- drivers/md/raid0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f61693e59684..1e772287b1c8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -154,7 +154,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } else { pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", mdname(mddev)); - pr_err("md/raid0: please set raid.default_layout to 1 or 2\n"); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); err = -ENOTSUPP; goto abort; } -- cgit From 13bd677a472d534bf100bab2713efc3f9e3f5978 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 16 Oct 2019 09:21:50 -0400 Subject: dm cache: fix bugs when a GFP_NOWAIT allocation fails GFP_NOWAIT allocation can fail anytime - it doesn't wait for memory being available and it fails if the mempool is exhausted and there is not enough memory. If we go down this path: map_bio -> mg_start -> alloc_migration -> mempool_alloc(GFP_NOWAIT) we can see that map_bio() doesn't check the return value of mg_start(), and the bio is leaked. If we go down this path: map_bio -> mg_start -> mg_lock_writes -> alloc_prison_cell -> dm_bio_prison_alloc_cell_v2 -> mempool_alloc(GFP_NOWAIT) -> mg_lock_writes -> mg_complete the bio is ended with an error - it is unacceptable because it could cause filesystem corruption if the machine ran out of memory temporarily. Change GFP_NOWAIT to GFP_NOIO, so that the mempool code will properly wait until memory becomes available. mempool_alloc with GFP_NOIO can't fail, so remove the code paths that deal with allocation failure. Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d249cf8ac277..8346e6d1816c 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -542,7 +542,7 @@ static void wake_migration_worker(struct cache *cache) static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) { - return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); } static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) @@ -554,9 +554,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache) { struct dm_cache_migration *mg; - mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT); - if (!mg) - return NULL; + mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); memset(mg, 0, sizeof(*mg)); @@ -664,10 +662,6 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ - if (!cell_prealloc) { - defer_bio(cache, bio); - return false; - } build_key(oblock, end, &key); r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); @@ -1493,11 +1487,6 @@ static int mg_lock_writes(struct dm_cache_migration *mg) struct dm_bio_prison_cell_v2 *prealloc; prealloc = alloc_prison_cell(cache); - if (!prealloc) { - DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); - mg_complete(mg, false); - return -ENOMEM; - } /* * Prevent writes to the block, but allow reads to continue. @@ -1535,11 +1524,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio } mg = alloc_migration(cache); - if (!mg) { - policy_complete_background_work(cache->policy, op, false); - background_work_end(cache); - return -ENOMEM; - } mg->op = op; mg->overwrite_bio = bio; @@ -1628,10 +1612,6 @@ static int invalidate_lock(struct dm_cache_migration *mg) struct dm_bio_prison_cell_v2 *prealloc; prealloc = alloc_prison_cell(cache); - if (!prealloc) { - invalidate_complete(mg, false); - return -ENOMEM; - } build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); r = dm_cell_lock_v2(cache->prison, &key, @@ -1669,10 +1649,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock, return -EPERM; mg = alloc_migration(cache); - if (!mg) { - background_work_end(cache); - return -ENOMEM; - } mg->overwrite_bio = bio; mg->invalidate_cblock = cblock; -- cgit