Diffstat (limited to 'drivers/md/dm-snap.c')
-rw-r--r--  drivers/md/dm-snap.c | 1156
1 file changed, 856 insertions, 300 deletions
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c434e5aab2df..f40c18da4000 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
- * dm-snapshot.c
- *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
  *
  * This file is released under the GPL.
@@ -13,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/kdev_t.h>
 #include <linux/list.h>
+#include <linux/list_bl.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -20,6 +20,8 @@
 #include <linux/log2.h>
 #include <linux/dm-kcopyd.h>
 
+#include "dm.h"
+
 #include "dm-exception-store.h"
 
 #define DM_MSG_PREFIX "snapshots"
@@ -40,8 +42,8 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
 
 struct dm_exception_table {
 	uint32_t hash_mask;
-	unsigned hash_shift;
-	struct list_head *table;
+	unsigned int hash_shift;
+	struct hlist_bl_head *table;
 };
 
 struct dm_snapshot {
@@ -61,12 +63,33 @@ struct dm_snapshot {
 	 */
 	int valid;
 
+	/*
+	 * The snapshot overflowed because of a write to the snapshot device.
+	 * We don't have to invalidate the snapshot in this case, but we need
+	 * to prevent further writes.
+	 */
+	int snapshot_overflowed;
+
 	/* Origin writes don't trigger exceptions until this is set */
 	int active;
 
 	atomic_t pending_exceptions_count;
 
-	mempool_t *pending_pool;
+	spinlock_t pe_allocation_lock;
+
+	/* Protected by "pe_allocation_lock" */
+	sector_t exception_start_sequence;
+
+	/* Protected by kcopyd single-threaded callback */
+	sector_t exception_complete_sequence;
+
+	/*
+	 * A list of pending exceptions that completed out of order.
+	 * Protected by kcopyd single-threaded callback.
+	 */
+	struct rb_root out_of_order_tree;
+
+	mempool_t pending_pool;
 
 	struct dm_exception_table pending;
 	struct dm_exception_table complete;
@@ -84,6 +107,9 @@ struct dm_snapshot {
 	/* The on disk metadata handler */
 	struct dm_exception_store *store;
 
+	unsigned int in_progress;
+	struct wait_queue_head in_progress_wait;
+
 	struct dm_kcopyd_client *kcopyd_client;
 
 	/* Wait for events based on state_bits */
@@ -97,16 +123,19 @@ struct dm_snapshot {
 	 * The merge operation failed if this flag is set.
 	 * Failure modes are handled as follows:
 	 * - I/O error reading the header
-	 *   	=> don't load the target; abort.
+	 *	=> don't load the target; abort.
 	 * - Header does not have "valid" flag set
-	 *   	=> use the origin; forget about the snapshot.
+	 *	=> use the origin; forget about the snapshot.
 	 * - I/O error when reading exceptions
-	 *   	=> don't load the target; abort.
+	 *	=> don't load the target; abort.
 	 *   (We can't use the intermediate origin state.)
 	 * - I/O error while merging
 	 *	=> stop merging; set merge_failed; process I/O normally.
 	 */
-	int merge_failed;
+	bool merge_failed:1;
+
+	bool discard_zeroes_cow:1;
+	bool discard_passdown_origin:1;
 
 	/*
 	 * Incoming bios that overlap with chunks being merged must wait
@@ -124,6 +153,19 @@ struct dm_snapshot {
 #define RUNNING_MERGE          0
 #define SHUTDOWN_MERGE         1
 
+/*
+ * Maximum number of chunks being copied on write.
+ *
+ * The value was decided experimentally as a trade-off between memory
+ * consumption, stalling the kernel's workqueues and maintaining a high enough
+ * throughput.
+ */
+#define DEFAULT_COW_THRESHOLD 2048
+
+static unsigned int cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
+MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
+
 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 		"A percentage of time allocated for copy on write");
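The threshold above caps how many chunk-sized copies a snapshot may have in flight; writers that would push past it sleep until completions drain the backlog. A minimal userspace model of the accounting scheme this diff adds further down (account_start_copy(), account_end_copy() and wait_for_in_progress()), with a pthread mutex and condition variable standing in for the kernel's spinlocked wait queue — the names and structure here are illustrative, not kernel API:

/* Simplified userspace model of the in-flight COW throttle; illustrative only. */
#include <pthread.h>

#define COW_THRESHOLD 2048

static unsigned int in_progress;
static pthread_mutex_t throttle_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t below_threshold = PTHREAD_COND_INITIALIZER;

/* Called when a chunk copy is handed to the copy engine. */
static void account_start_copy(void)
{
	pthread_mutex_lock(&throttle_lock);
	in_progress++;
	pthread_mutex_unlock(&throttle_lock);
}

/* Called from the copy-completion callback. */
static void account_end_copy(void)
{
	pthread_mutex_lock(&throttle_lock);
	in_progress--;
	if (in_progress <= COW_THRESHOLD)
		pthread_cond_broadcast(&below_threshold);
	pthread_mutex_unlock(&throttle_lock);
}

/* Writers block here until the backlog of copies drains below the cap. */
static void wait_for_in_progress(void)
{
	pthread_mutex_lock(&throttle_lock);
	while (in_progress > COW_THRESHOLD)
		pthread_cond_wait(&below_threshold, &throttle_lock);
	pthread_mutex_unlock(&throttle_lock);
}

The kernel version in this diff also rechecks the counter locklessly before taking the lock and, in the origin-write path, drops _origins_lock before sleeping; the sketch omits both refinements.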
@@ -173,12 +215,19 @@ struct dm_snap_pending_exception {
 	 */
 	int started;
 
+	/* There was copying error. */
+	int copy_error;
+
+	/* A sequence number, it is used for in-order completion. */
+	sector_t exception_sequence;
+
+	struct rb_node out_of_order_node;
+
 	/*
 	 * For writing a complete chunk, bypassing the copy.
 	 */
 	struct bio *full_bio;
 	bio_end_io_t *full_bio_end_io;
-	void *full_bio_private;
 };
 
 /*
@@ -196,12 +245,14 @@ struct dm_snap_tracked_chunk {
 static void init_tracked_chunk(struct bio *bio)
 {
 	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+
 	INIT_HLIST_NODE(&c->node);
 }
 
 static bool is_bio_tracked(struct bio *bio)
 {
 	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+
 	return !hlist_unhashed(&c->node);
 }
 
@@ -249,12 +300,12 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
 
 /*
  * This conflicting I/O is extremely improbable in the caller,
- * so msleep(1) is sufficient and there is no need for a wait queue.
+ * so fsleep(1000) is sufficient and there is no need for a wait queue.
  */
static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
 {
 	while (__chunk_is_tracked(s, chunk))
-		msleep(1);
+		fsleep(1000);
 }
 
 /*
@@ -271,12 +322,23 @@ struct origin {
 };
 
 /*
+ * This structure is allocated for each origin target
+ */
+struct dm_origin {
+	struct dm_dev *dev;
+	struct dm_target *ti;
+	unsigned int split_boundary;
+	struct list_head hash_list;
+};
+
+/*
  * Size of the hash table for origin volumes. If we make this
 * the size of the minors list then it should be nearly perfect
  */
 #define ORIGIN_HASH_SIZE 256
 #define ORIGIN_MASK      0xFF
 static struct list_head *_origins;
+static struct list_head *_dm_origins;
 static struct rw_semaphore _origins_lock;
 
 static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
@@ -287,15 +349,26 @@ static int init_origin_hash(void)
 {
 	int i;
 
-	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
-			   GFP_KERNEL);
+	_origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
+				 GFP_KERNEL);
 	if (!_origins) {
-		DMERR("unable to allocate memory");
+		DMERR("unable to allocate memory for _origins");
 		return -ENOMEM;
 	}
-
 	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
 		INIT_LIST_HEAD(_origins + i);
+
+	_dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
+				    sizeof(struct list_head),
+				    GFP_KERNEL);
+	if (!_dm_origins) {
+		DMERR("unable to allocate memory for _dm_origins");
+		kfree(_origins);
+		return -ENOMEM;
+	}
+	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
+		INIT_LIST_HEAD(_dm_origins + i);
+
 	init_rwsem(&_origins_lock);
 
 	return 0;
@@ -304,9 +377,10 @@ static int init_origin_hash(void)
 static void exit_origin_hash(void)
 {
 	kfree(_origins);
+	kfree(_dm_origins);
 }
 
-static unsigned origin_hash(struct block_device *bdev)
+static unsigned int origin_hash(struct block_device *bdev)
 {
 	return bdev->bd_dev & ORIGIN_MASK;
 }
@@ -317,7 +391,7 @@ static struct origin *__lookup_origin(struct block_device *origin)
 	struct origin *o;
 
 	ol = &_origins[origin_hash(origin)];
-	list_for_each_entry (o, ol, hash_list)
+	list_for_each_entry(o, ol, hash_list)
 		if (bdev_equal(o->bdev, origin))
 			return o;
 
@@ -327,9 +401,35 @@ static struct origin *__lookup_origin(struct block_device *origin)
 static void __insert_origin(struct origin *o)
 {
 	struct list_head *sl = &_origins[origin_hash(o->bdev)];
+
 	list_add_tail(&o->hash_list, sl);
 }
 
+static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
+{
+	struct list_head *ol;
+	struct dm_origin *o;
+
+	ol = &_dm_origins[origin_hash(origin)];
+	list_for_each_entry(o, ol, hash_list)
+		if (bdev_equal(o->dev->bdev, origin))
+			return o;
+
+	return NULL;
+}
+
+static void __insert_dm_origin(struct dm_origin *o)
+{
+	struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
+
+	list_add_tail(&o->hash_list, sl);
+}
+
+static void __remove_dm_origin(struct dm_origin *o)
+{
+	list_del(&o->hash_list);
+}
+
 /*
  * _origins_lock must be held when calling this function.
  * Returns number of snapshots registered using the supplied cow device, plus:
@@ -395,8 +495,7 @@ static int __validate_exception_handover(struct dm_snapshot *snap)
 	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
 					  &snap_merge) == 2) ||
 	    snap_dest) {
-		snap->ti->error = "Snapshot cow pairing for exception "
-				  "table handover failed";
+		snap->ti->error = "Snapshot cow pairing for exception table handover failed";
 		return -EINVAL;
 	}
 
@@ -423,8 +522,7 @@ static int __validate_exception_handover(struct dm_snapshot *snap)
 
 	if (!snap_src->store->type->prepare_merge ||
 	    !snap_src->store->type->commit_merge) {
-		snap->ti->error = "Snapshot exception store does not "
-				  "support snapshot-merge.";
+		snap->ti->error = "Snapshot exception store does not support snapshot-merge.";
 		return -EINVAL;
 	}
@@ -526,19 +624,50 @@ static void unregister_snapshot(struct dm_snapshot *s)
  * The lowest hash_shift bits of the chunk number are ignored, allowing
  * some consecutive chunks to be grouped together.
  */
+static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
+
+/* Lock to protect access to the completed and pending exception hash tables. */
+struct dm_exception_table_lock {
+	struct hlist_bl_head *complete_slot;
+	struct hlist_bl_head *pending_slot;
+};
+
+static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
+					 struct dm_exception_table_lock *lock)
+{
+	struct dm_exception_table *complete = &s->complete;
+	struct dm_exception_table *pending = &s->pending;
+
+	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
+	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
+}
+
+static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
+{
+	hlist_bl_lock(lock->complete_slot);
+	hlist_bl_lock(lock->pending_slot);
+}
+
+static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
+{
+	hlist_bl_unlock(lock->pending_slot);
+	hlist_bl_unlock(lock->complete_slot);
+}
+
 static int dm_exception_table_init(struct dm_exception_table *et,
-				   uint32_t size, unsigned hash_shift)
+				   uint32_t size, unsigned int hash_shift)
 {
 	unsigned int i;
 
 	et->hash_shift = hash_shift;
 	et->hash_mask = size - 1;
-	et->table = dm_vcalloc(size, sizeof(struct list_head));
+	et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head),
+				   GFP_KERNEL);
 	if (!et->table)
 		return -ENOMEM;
 
 	for (i = 0; i < size; i++)
-		INIT_LIST_HEAD(et->table + i);
+		INIT_HLIST_BL_HEAD(et->table + i);
 
 	return 0;
 }
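These helpers replace coarse snapshot-wide locking for table access with per-bucket locking: each hlist_bl_head doubles as a bit spinlock, and the two tables' buckets for a given chunk are always taken in one fixed order (complete, then pending) and released in reverse, so concurrent writers to the same chunk cannot deadlock. A compressed userspace model of that discipline, with one pthread mutex per bucket standing in for the bit locks — all names here are illustrative, and the real exception_hash() also applies hash_shift before masking:

/* Userspace model of per-bucket exception-table locking; illustrative only. */
#include <pthread.h>
#include <stdint.h>

#define NR_BUCKETS 64	/* must be a power of two */

struct table {
	pthread_mutex_t bucket_lock[NR_BUCKETS];
	/* per-bucket chain heads would live here */
};

static struct table complete_table, pending_table;

static unsigned int bucket(uint64_t chunk)
{
	return chunk & (NR_BUCKETS - 1);
}

/* Both tables are always locked in the same order: complete, then pending. */
static void chunk_lock(uint64_t chunk)
{
	pthread_mutex_lock(&complete_table.bucket_lock[bucket(chunk)]);
	pthread_mutex_lock(&pending_table.bucket_lock[bucket(chunk)]);
}

/* Unlock in the reverse order. */
static void chunk_unlock(uint64_t chunk)
{
	pthread_mutex_unlock(&pending_table.bucket_lock[bucket(chunk)]);
	pthread_mutex_unlock(&complete_table.bucket_lock[bucket(chunk)]);
}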
@@ -546,19 +675,22 @@ static int dm_exception_table_init(struct dm_exception_table *et,
 static void dm_exception_table_exit(struct dm_exception_table *et,
 				    struct kmem_cache *mem)
 {
-	struct list_head *slot;
-	struct dm_exception *ex, *next;
+	struct hlist_bl_head *slot;
+	struct dm_exception *ex;
+	struct hlist_bl_node *pos, *n;
 	int i, size;
 
 	size = et->hash_mask + 1;
 	for (i = 0; i < size; i++) {
 		slot = et->table + i;
 
-		list_for_each_entry_safe (ex, next, slot, hash_list)
+		hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) {
 			kmem_cache_free(mem, ex);
+			cond_resched();
+		}
 	}
 
-	vfree(et->table);
+	kvfree(et->table);
 }
 
@@ -568,7 +700,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
 
 static void dm_remove_exception(struct dm_exception *e)
 {
-	list_del(&e->hash_list);
+	hlist_bl_del(&e->hash_list);
 }
 
 /*
@@ -578,11 +710,12 @@ static void dm_remove_exception(struct dm_exception *e)
 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
 						chunk_t chunk)
 {
-	struct list_head *slot;
+	struct hlist_bl_head *slot;
+	struct hlist_bl_node *pos;
 	struct dm_exception *e;
 
 	slot = &et->table[exception_hash(et, chunk)];
-	list_for_each_entry (e, slot, hash_list)
+	hlist_bl_for_each_entry(e, pos, slot, hash_list)
 		if (chunk >= e->old_chunk &&
 		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
 			return e;
@@ -590,12 +723,12 @@ static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
 	return NULL;
 }
 
-static struct dm_exception *alloc_completed_exception(void)
+static struct dm_exception *alloc_completed_exception(gfp_t gfp)
 {
 	struct dm_exception *e;
 
-	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
-	if (!e)
+	e = kmem_cache_alloc(exception_cache, gfp);
+	if (!e && gfp == GFP_NOIO)
 		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
 
 	return e;
@@ -608,7 +741,7 @@ static void free_completed_exception(struct dm_exception *e)
 
 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
 {
-	struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
+	struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
 							     GFP_NOIO);
 
 	atomic_inc(&s->pending_exceptions_count);
@@ -621,15 +754,16 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 {
 	struct dm_snapshot *s = pe->snap;
 
-	mempool_free(pe, s->pending_pool);
-	smp_mb__before_atomic_dec();
+	mempool_free(pe, &s->pending_pool);
+	smp_mb__before_atomic();
 	atomic_dec(&s->pending_exceptions_count);
 }
 
 static void dm_insert_exception(struct dm_exception_table *eh,
 				struct dm_exception *new_e)
 {
-	struct list_head *l;
+	struct hlist_bl_head *l;
+	struct hlist_bl_node *pos;
 	struct dm_exception *e = NULL;
 
 	l = &eh->table[exception_hash(eh, new_e->old_chunk)];
@@ -639,7 +773,7 @@ static void dm_insert_exception(struct dm_exception_table *eh,
 		goto out;
 
 	/* List is ordered by old_chunk */
-	list_for_each_entry_reverse(e, l, hash_list) {
+	hlist_bl_for_each_entry(e, pos, l, hash_list) {
 		/* Insert after an existing chunk? */
 		if (new_e->old_chunk == (e->old_chunk +
 					 dm_consecutive_chunk_count(e) + 1) &&
@@ -660,12 +794,24 @@ static void dm_insert_exception(struct dm_exception_table *eh,
 			return;
 		}
 
-		if (new_e->old_chunk > e->old_chunk)
+		if (new_e->old_chunk < e->old_chunk)
 			break;
 	}
 
 out:
-	list_add(&new_e->hash_list, e ? &e->hash_list : l);
+	if (!e) {
+		/*
+		 * Either the table doesn't support consecutive chunks or slot
+		 * l is empty.
+		 */
+		hlist_bl_add_head(&new_e->hash_list, l);
+	} else if (new_e->old_chunk < e->old_chunk) {
+		/* Add before an existing exception */
+		hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
+	} else {
+		/* Add to l's tail: e is the last exception in this slot */
+		hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
+	}
 }
 
 /*
@@ -674,10 +820,11 @@ out:
  */
 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 {
+	struct dm_exception_table_lock lock;
 	struct dm_snapshot *s = context;
 	struct dm_exception *e;
 
-	e = alloc_completed_exception();
+	e = alloc_completed_exception(GFP_KERNEL);
 	if (!e)
 		return -ENOMEM;
 
@@ -686,7 +833,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	/* Consecutive_count is implicitly initialised to zero */
 	e->new_chunk = new;
 
+	/*
+	 * Although there is no need to lock access to the exception tables
+	 * here, if we don't then hlist_bl_add_head(), called by
+	 * dm_insert_exception(), will complain about accessing the
+	 * corresponding list without locking it first.
+	 */
+	dm_exception_table_lock_init(s, old, &lock);
+
+	dm_exception_table_lock(&lock);
 	dm_insert_exception(&s->complete, e);
+	dm_exception_table_unlock(&lock);
 
 	return 0;
 }
 
@@ -698,7 +855,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 static uint32_t __minimum_chunk_size(struct origin *o)
 {
 	struct dm_snapshot *snap;
-	unsigned chunk_size = 0;
+	unsigned int chunk_size = rounddown_pow_of_two(UINT_MAX);
 
 	if (o)
 		list_for_each_entry(snap, &o->snapshots, list)
@@ -715,7 +872,8 @@ static int calc_max_buckets(void)
 {
 	/* use a fixed size of 2MB */
 	unsigned long mem = 2 * 1024 * 1024;
+
-	mem /= sizeof(struct list_head);
+	mem /= sizeof(struct hlist_bl_head);
 
 	return mem;
 }
 
@@ -725,17 +883,16 @@ static int calc_max_buckets(void)
  */
 static int init_hash_tables(struct dm_snapshot *s)
 {
-	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
+	sector_t hash_size, cow_dev_size, max_buckets;
 
 	/*
 	 * Calculate based on the size of the original volume or
 	 * the COW volume...
 	 */
 	cow_dev_size = get_dev_size(s->cow->bdev);
-	origin_dev_size = get_dev_size(s->origin->bdev);
 	max_buckets = calc_max_buckets();
 
-	hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
+	hash_size = cow_dev_size >> s->store->chunk_shift;
 	hash_size = min(hash_size, max_buckets);
 
 	if (hash_size < 64)
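The sizing change above keys the completed-exception table off the COW device alone — the table can never hold more exceptions than the COW device has chunks — clamped by the 2 MB bucket-array budget from calc_max_buckets(). A standalone sketch of that arithmetic follows; the floor of 64 is visible in this hunk, while the final round-down to a power of two happens in the remainder of init_hash_tables() that the hunk does not show, so treat the exact ordering here as an assumption:

/* Standalone model of the exception hash-table sizing policy; illustrative. */
#include <stdint.h>
#include <stdio.h>

/* Round down to the nearest power of two (v >= 1). */
static uint64_t rounddown_pow_of_two(uint64_t v)
{
	while (v & (v - 1))
		v &= v - 1;
	return v;
}

static uint64_t hash_table_size(uint64_t cow_sectors, unsigned int chunk_shift)
{
	/* A 2 MB budget for bucket heads, 8 bytes per hlist_bl_head. */
	uint64_t max_buckets = (2 * 1024 * 1024) / 8;
	uint64_t hash_size = cow_sectors >> chunk_shift;

	if (hash_size > max_buckets)
		hash_size = max_buckets;
	if (hash_size < 64)
		hash_size = 64;
	return rounddown_pow_of_two(hash_size);
}

int main(void)
{
	/* e.g. a 10 GiB COW device with 8-sector (4 KiB) chunks */
	printf("%llu buckets\n",
	       (unsigned long long)hash_table_size(10ULL << 21, 3));
	return 0;
}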
@@ -764,7 +921,7 @@
 static void merge_shutdown(struct dm_snapshot *s)
 {
 	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&s->state_bits, RUNNING_MERGE);
 }
 
@@ -786,8 +943,7 @@ static int __remove_single_exception_chunk(struct dm_snapshot *s,
 
 	e = dm_lookup_exception(&s->complete, old_chunk);
 	if (!e) {
-		DMERR("Corruption detected: exception for block %llu is "
-		      "on disk but not in memory",
+		DMERR("Corruption detected: exception for block %llu is on disk but not in memory",
 		      (unsigned long long)old_chunk);
 		return -EINVAL;
 	}
 
@@ -814,8 +970,7 @@ static int __remove_single_exception_chunk(struct dm_snapshot *s,
 		e->new_chunk++;
 	} else if (old_chunk != e->old_chunk +
 		   dm_consecutive_chunk_count(e)) {
-		DMERR("Attempt to merge block %llu from the "
-		      "middle of a chunk range [%llu - %llu]",
+		DMERR("Attempt to merge block %llu from the middle of a chunk range [%llu - %llu]",
 		      (unsigned long long)old_chunk,
 		      (unsigned long long)e->old_chunk,
 		      (unsigned long long)
@@ -859,7 +1014,7 @@ out:
 }
 
 static int origin_write_extent(struct dm_snapshot *merging_snap,
-			       sector_t sector, unsigned chunk_size);
+			       sector_t sector, unsigned int chunk_size);
 
 static void merge_callback(int read_err, unsigned long write_err,
 			   void *context);
 
@@ -908,10 +1063,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 						      &new_chunk);
 	if (linear_chunks <= 0) {
 		if (linear_chunks < 0) {
-			DMERR("Read error in exception store: "
-			      "shutting down merge");
+			DMERR("Read error in exception store: shutting down merge");
 			down_write(&s->lock);
-			s->merge_failed = 1;
+			s->merge_failed = true;
 			up_write(&s->lock);
 		}
 		goto shut;
 
@@ -984,6 +1138,11 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
 		goto shut;
 	}
 
+	if (blkdev_issue_flush(s->origin->bdev) < 0) {
+		DMERR("Flush after merge failed: shutting down merge");
+		goto shut;
+	}
+
 	if (s->store->type->commit_merge(s->store,
 					 s->num_merging_chunks) < 0) {
 		DMERR("Write error in exception store: shutting down merge");
 
@@ -999,7 +1158,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
 
 shut:
 	down_write(&s->lock);
-	s->merge_failed = 1;
+	s->merge_failed = true;
 	b = __release_queued_bios_after_merge(s);
 	up_write(&s->lock);
 	error_bios(b);
 
@@ -1013,54 +1172,105 @@ static void start_merge(struct dm_snapshot *s)
 		snapshot_merge_next_chunks(s);
 }
 
-static int wait_schedule(void *ptr)
-{
-	schedule();
-
-	return 0;
-}
-
 /*
  * Stop the merging process and wait until it finishes.
  */
 static void stop_merge(struct dm_snapshot *s)
 {
 	set_bit(SHUTDOWN_MERGE, &s->state_bits);
-	wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule,
-		    TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
 	clear_bit(SHUTDOWN_MERGE, &s->state_bits);
 }
 
+static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
+				   struct dm_target *ti)
+{
+	int r;
+	unsigned int argc;
+	const char *arg_name;
+
+	static const struct dm_arg _args[] = {
+		{0, 2, "Invalid number of feature arguments"},
+	};
+
+	/*
+	 * No feature arguments supplied.
+	 */
+	if (!as->argc)
+		return 0;
+
+	r = dm_read_arg_group(_args, as, &argc, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	while (argc && !r) {
+		arg_name = dm_shift_arg(as);
+		argc--;
+
+		if (!strcasecmp(arg_name, "discard_zeroes_cow"))
+			s->discard_zeroes_cow = true;
+
+		else if (!strcasecmp(arg_name, "discard_passdown_origin"))
+			s->discard_passdown_origin = true;
+
+		else {
+			ti->error = "Unrecognised feature requested";
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
+		/*
+		 * TODO: really these are disjoint.. but ti->num_discard_bios
+		 * and dm_bio_get_target_bio_nr() require rigid constraints.
+		 */
+		ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
 /*
- * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
+ * Construct a snapshot mapping:
+ * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
  */
 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct dm_snapshot *s;
+	struct dm_arg_set as;
 	int i;
 	int r = -EINVAL;
 	char *origin_path, *cow_path;
-	unsigned args_used, num_flush_bios = 1;
-	fmode_t origin_mode = FMODE_READ;
+	unsigned int args_used, num_flush_bios = 1;
+	blk_mode_t origin_mode = BLK_OPEN_READ;
 
-	if (argc != 4) {
-		ti->error = "requires exactly 4 arguments";
+	if (argc < 4) {
+		ti->error = "requires 4 or more arguments";
 		r = -EINVAL;
 		goto bad;
 	}
 
 	if (dm_target_is_snapshot_merge(ti)) {
 		num_flush_bios = 2;
-		origin_mode = FMODE_WRITE;
+		origin_mode = BLK_OPEN_WRITE;
 	}
 
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
 	if (!s) {
 		ti->error = "Cannot allocate private snapshot structure";
 		r = -ENOMEM;
 		goto bad;
 	}
 
+	as.argc = argc;
+	as.argv = argv;
+	dm_consume_args(&as, 4);
+	r = parse_snapshot_features(&as, s, ti);
+	if (r)
+		goto bad_features;
+
 	origin_path = argv[0];
 	argv++;
 	argc--;
 
@@ -1080,6 +1290,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Cannot get COW device";
 		goto bad_cow;
 	}
+	if (s->cow->bdev && s->cow->bdev == s->origin->bdev) {
+		ti->error = "COW device cannot be the same as origin device";
+		r = -EINVAL;
+		goto bad_store;
+	}
 
 	r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
 	if (r) {
 
@@ -1093,13 +1308,18 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	s->ti = ti;
 	s->valid = 1;
+	s->snapshot_overflowed = 0;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	spin_lock_init(&s->pe_allocation_lock);
+	s->exception_start_sequence = 0;
+	s->exception_complete_sequence = 0;
+	s->out_of_order_tree = RB_ROOT;
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
 	s->state_bits = 0;
-	s->merge_failed = 0;
+	s->merge_failed = false;
 	s->first_merging_chunk = 0;
 	s->num_merging_chunks = 0;
 	bio_list_init(&s->bios_queued_during_merge);
 
@@ -1111,6 +1331,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_hash_tables;
 	}
 
+	init_waitqueue_head(&s->in_progress_wait);
+
 	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
 	if (IS_ERR(s->kcopyd_client)) {
 		r = PTR_ERR(s->kcopyd_client);
 
@@ -1118,10 +1340,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_kcopyd;
 	}
 
-	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
-	if (!s->pending_pool) {
+	r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
+	if (r) {
 		ti->error = "Could not allocate mempool for pending exceptions";
= "Could not allocate mempool for pending exceptions"; - r = -ENOMEM; goto bad_pending_pool; } @@ -1132,7 +1353,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = s; ti->num_flush_bios = num_flush_bios; - ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); + if (s->discard_zeroes_cow) + ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1); + ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ @@ -1168,6 +1391,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!s->store->chunk_size) { ti->error = "Chunk size not set"; + r = -EINVAL; goto bad_read_metadata; } @@ -1179,29 +1403,22 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) bad_read_metadata: unregister_snapshot(s); - bad_load_and_register: - mempool_destroy(s->pending_pool); - + mempool_exit(&s->pending_pool); bad_pending_pool: dm_kcopyd_client_destroy(s->kcopyd_client); - bad_kcopyd: dm_exception_table_exit(&s->pending, pending_cache); dm_exception_table_exit(&s->complete, exception_cache); - bad_hash_tables: dm_exception_store_destroy(s->store); - bad_store: dm_put_device(ti, s->cow); - bad_cow: dm_put_device(ti, s->origin); - bad_origin: +bad_features: kfree(s); - bad: return r; } @@ -1232,6 +1449,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src, u.store_swap = snap_dest->store; snap_dest->store = snap_src->store; + snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow; snap_src->store = u.store_swap; snap_dest->store->snap = snap_dest; @@ -1239,6 +1457,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src, snap_dest->ti->max_io_len = snap_dest->store->chunk_size; snap_dest->valid = snap_src->valid; + snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed; /* * Set source invalid to ensure it receives no further I/O. @@ -1273,9 +1492,9 @@ static void snapshot_dtr(struct dm_target *ti) unregister_snapshot(s); while (atomic_read(&s->pending_exceptions_count)) - msleep(1); + fsleep(1000); /* - * Ensure instructions in mempool_destroy aren't reordered + * Ensure instructions in mempool_exit aren't reordered * before atomic_read. 
 	 */
 	smp_mb();
 
@@ -1287,7 +1506,7 @@ static void snapshot_dtr(struct dm_target *ti)
 
 	__free_exceptions(s);
 
-	mempool_destroy(s->pending_pool);
+	mempool_exit(&s->pending_pool);
 
 	dm_exception_store_destroy(s->store);
 
@@ -1295,9 +1514,57 @@ static void snapshot_dtr(struct dm_target *ti)
 
 	dm_put_device(ti, s->origin);
 
+	WARN_ON(s->in_progress);
+
 	kfree(s);
 }
 
+static void account_start_copy(struct dm_snapshot *s)
+{
+	spin_lock(&s->in_progress_wait.lock);
+	s->in_progress++;
+	spin_unlock(&s->in_progress_wait.lock);
+}
+
+static void account_end_copy(struct dm_snapshot *s)
+{
+	spin_lock(&s->in_progress_wait.lock);
+	BUG_ON(!s->in_progress);
+	s->in_progress--;
+	if (likely(s->in_progress <= cow_threshold) &&
+	    unlikely(waitqueue_active(&s->in_progress_wait)))
+		wake_up_locked(&s->in_progress_wait);
+	spin_unlock(&s->in_progress_wait.lock);
+}
+
+static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
+{
+	if (unlikely(s->in_progress > cow_threshold)) {
+		spin_lock(&s->in_progress_wait.lock);
+		if (likely(s->in_progress > cow_threshold)) {
+			/*
+			 * NOTE: this throttle doesn't account for whether
+			 * the caller is servicing an IO that will trigger a COW
+			 * so excess throttling may result for chunks not required
+			 * to be COW'd. But if cow_threshold was reached, extra
+			 * throttling is unlikely to negatively impact performance.
+			 */
+			DECLARE_WAITQUEUE(wait, current);
+
+			__add_wait_queue(&s->in_progress_wait, &wait);
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock(&s->in_progress_wait.lock);
+			if (unlock_origins)
+				up_read(&_origins_lock);
+			io_schedule();
+			remove_wait_queue(&s->in_progress_wait, &wait);
+			return false;
+		}
+		spin_unlock(&s->in_progress_wait.lock);
+	}
+	return true;
+}
+
 /*
  * Flush a list of buffers.
  */
@@ -1308,12 +1575,12 @@ static void flush_bios(struct bio *bio)
 	while (bio) {
 		n = bio->bi_next;
 		bio->bi_next = NULL;
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		bio = n;
 	}
 }
 
-static int do_origin(struct dm_dev *origin, struct bio *bio);
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
 
 /*
  * Flush a list of buffers.
 */
@@ -1326,9 +1593,9 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
 	while (bio) {
 		n = bio->bi_next;
 		bio->bi_next = NULL;
-		r = do_origin(s->origin, bio);
+		r = do_origin(s->origin, bio, false);
 		if (r == DM_MAPIO_REMAPPED)
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		bio = n;
 	}
 }
 
@@ -1366,63 +1633,85 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
 	dm_table_event(s->ti->table);
 }
 
-static void pending_complete(struct dm_snap_pending_exception *pe, int success)
+static void invalidate_snapshot(struct dm_snapshot *s, int err)
 {
+	down_write(&s->lock);
+	__invalidate_snapshot(s, err);
+	up_write(&s->lock);
+}
+
+static void pending_complete(void *context, int success)
+{
+	struct dm_snap_pending_exception *pe = context;
 	struct dm_exception *e;
 	struct dm_snapshot *s = pe->snap;
 	struct bio *origin_bios = NULL;
 	struct bio *snapshot_bios = NULL;
 	struct bio *full_bio = NULL;
+	struct dm_exception_table_lock lock;
 	int error = 0;
 
+	dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
+
 	if (!success) {
 		/* Read/write error - snapshot is unusable */
-		down_write(&s->lock);
-		__invalidate_snapshot(s, -EIO);
+		invalidate_snapshot(s, -EIO);
 		error = 1;
+
+		dm_exception_table_lock(&lock);
 		goto out;
 	}
 
-	e = alloc_completed_exception();
+	e = alloc_completed_exception(GFP_NOIO);
 	if (!e) {
-		down_write(&s->lock);
-		__invalidate_snapshot(s, -ENOMEM);
+		invalidate_snapshot(s, -ENOMEM);
 		error = 1;
+
+		dm_exception_table_lock(&lock);
 		goto out;
 	}
 	*e = pe->e;
 
-	down_write(&s->lock);
+	down_read(&s->lock);
+	dm_exception_table_lock(&lock);
 	if (!s->valid) {
+		up_read(&s->lock);
 		free_completed_exception(e);
 		error = 1;
+
 		goto out;
 	}
 
-	/* Check for conflicting reads */
-	__check_for_conflicting_io(s, pe->e.old_chunk);
-
 	/*
-	 * Add a proper exception, and remove the
-	 * in-flight exception from the list.
+	 * Add a proper exception. After inserting the completed exception all
+	 * subsequent snapshot reads to this chunk will be redirected to the
+	 * COW device. This ensures that we do not starve. Moreover, as long
+	 * as the pending exception exists, neither origin writes nor snapshot
+	 * merging can overwrite the chunk in origin.
 	 */
 	dm_insert_exception(&s->complete, e);
+	up_read(&s->lock);
+
+	/* Wait for conflicting reads to drain */
+	if (__chunk_is_tracked(s, pe->e.old_chunk)) {
+		dm_exception_table_unlock(&lock);
+		__check_for_conflicting_io(s, pe->e.old_chunk);
+		dm_exception_table_lock(&lock);
+	}
 
 out:
+	/* Remove the in-flight exception from the list */
 	dm_remove_exception(&pe->e);
+
+	dm_exception_table_unlock(&lock);
+
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = bio_list_get(&pe->origin_bios);
 	full_bio = pe->full_bio;
-	if (full_bio) {
+	if (full_bio)
 		full_bio->bi_end_io = pe->full_bio_end_io;
-		full_bio->bi_private = pe->full_bio_private;
-	}
-	free_pending_exception(pe);
-
 	increment_pending_exceptions_done_count();
 
-	up_write(&s->lock);
-
 	/* Submit any pending write bios */
 	if (error) {
 		if (full_bio)
@@ -1430,18 +1719,22 @@ out:
 		error_bios(snapshot_bios);
 	} else {
 		if (full_bio)
-			bio_endio(full_bio, 0);
+			bio_endio(full_bio);
 		flush_bios(snapshot_bios);
 	}
 
 	retry_origin_bios(s, origin_bios);
+
+	free_pending_exception(pe);
 }
 
-static void commit_callback(void *context, int success)
+static void complete_exception(struct dm_snap_pending_exception *pe)
 {
-	struct dm_snap_pending_exception *pe = context;
+	struct dm_snapshot *s = pe->snap;
 
-	pending_complete(pe, success);
+	/* Update the metadata if we are persistent */
+	s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
+					 pending_complete, pe);
 }
 
 /*
@@ -1453,13 +1746,46 @@
 static void copy_callback(int read_err, unsigned long write_err, void *context)
 {
 	struct dm_snap_pending_exception *pe = context;
 	struct dm_snapshot *s = pe->snap;
 
-	if (read_err || write_err)
-		pending_complete(pe, 0);
+	pe->copy_error = read_err || write_err;
+
+	if (pe->exception_sequence == s->exception_complete_sequence) {
+		struct rb_node *next;
+
+		s->exception_complete_sequence++;
+		complete_exception(pe);
+
+		next = rb_first(&s->out_of_order_tree);
+		while (next) {
+			pe = rb_entry(next, struct dm_snap_pending_exception,
+				      out_of_order_node);
+			if (pe->exception_sequence != s->exception_complete_sequence)
+				break;
+			next = rb_next(next);
+			s->exception_complete_sequence++;
+			rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
+			complete_exception(pe);
+			cond_resched();
+		}
+	} else {
+		struct rb_node *parent = NULL;
+		struct rb_node **p = &s->out_of_order_tree.rb_node;
+		struct dm_snap_pending_exception *pe2;
+
+		while (*p) {
+			pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
+			parent = *p;
+
+			BUG_ON(pe->exception_sequence == pe2->exception_sequence);
+			if (pe->exception_sequence < pe2->exception_sequence)
+				p = &((*p)->rb_left);
+			else
+				p = &((*p)->rb_right);
+		}
 
-	else
-		/* Update the metadata if we are persistent */
-		s->store->type->commit_exception(s->store, &pe->e,
-						 commit_callback, pe);
+		rb_link_node(&pe->out_of_order_node, parent, p);
+		rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
+	}
+	account_end_copy(s);
 }
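copy_callback() may run for chunk N+1 before chunk N: kcopyd completes copies in any order, but commits must reach the exception store in allocation order, so early completions are parked in the rb-tree keyed by sequence number and drained once the expected sequence arrives. The same idea in miniature, as userspace C with a sorted singly linked list standing in for the kernel's rb_root (all names illustrative, error handling elided):

/* Userspace model of in-order completion draining; illustrative only. */
#include <stdio.h>
#include <stdlib.h>

struct completion {
	unsigned long seq;
	struct completion *next;
};

static unsigned long next_to_complete;	/* next sequence we may commit */
static struct completion *parked;	/* early arrivals, sorted by seq */

static void commit(unsigned long seq)
{
	printf("commit %lu\n", seq);
}

/* Called as copies finish, possibly out of order. */
static void copy_finished(unsigned long seq)
{
	if (seq != next_to_complete) {
		/* Early: park it in sequence order. */
		struct completion **p = &parked;
		struct completion *c = malloc(sizeof(*c));

		c->seq = seq;
		while (*p && (*p)->seq < seq)
			p = &(*p)->next;
		c->next = *p;
		*p = c;
		return;
	}

	/* In order: commit it, then drain any consecutive early arrivals. */
	commit(seq);
	next_to_complete++;
	while (parked && parked->seq == next_to_complete) {
		struct completion *c = parked;

		parked = c->next;
		commit(c->seq);
		next_to_complete++;
		free(c);
	}
}

int main(void)
{
	copy_finished(1);	/* parked */
	copy_finished(2);	/* parked */
	copy_finished(0);	/* commits 0, then drains 1 and 2 */
	return 0;
}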
 
 /*
@@ -1483,14 +1809,15 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 	dest.count = src.count;
 
 	/* Hand over to kcopyd */
+	account_start_copy(s);
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
 }
 
-static void full_bio_end_io(struct bio *bio, int error)
+static void full_bio_end_io(struct bio *bio)
 {
 	void *callback_data = bio->bi_private;
 
-	dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+	dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
 }
 
 static void start_full_bio(struct dm_snap_pending_exception *pe,
@@ -1501,15 +1828,15 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
 
 	pe->full_bio = bio;
 	pe->full_bio_end_io = bio->bi_end_io;
-	pe->full_bio_private = bio->bi_private;
 
+	account_start_copy(s);
 	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
 						   copy_callback, pe);
 
 	bio->bi_end_io = full_bio_end_io;
 	bio->bi_private = callback_data;
 
-	generic_make_request(bio);
+	submit_bio_noacct(bio);
 }
 
 static struct dm_snap_pending_exception *
@@ -1524,50 +1851,98 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
 }
 
 /*
- * Looks to see if this snapshot already has a pending exception
- * for this chunk, otherwise it allocates a new one and inserts
- * it into the pending table.
+ * Inserts a pending exception into the pending table.
  *
- * NOTE: a write lock must be held on snap->lock before calling
- * this.
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
 */
 static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s,
-			 struct dm_snap_pending_exception *pe, chunk_t chunk)
+__insert_pending_exception(struct dm_snapshot *s,
+			   struct dm_snap_pending_exception *pe, chunk_t chunk)
 {
-	struct dm_snap_pending_exception *pe2;
-
-	pe2 = __lookup_pending_exception(s, chunk);
-	if (pe2) {
-		free_pending_exception(pe);
-		return pe2;
-	}
-
 	pe->e.old_chunk = chunk;
 	bio_list_init(&pe->origin_bios);
 	bio_list_init(&pe->snapshot_bios);
 	pe->started = 0;
 	pe->full_bio = NULL;
 
+	spin_lock(&s->pe_allocation_lock);
 	if (s->store->type->prepare_exception(s->store, &pe->e)) {
+		spin_unlock(&s->pe_allocation_lock);
 		free_pending_exception(pe);
 		return NULL;
 	}
 
+	pe->exception_sequence = s->exception_start_sequence++;
+	spin_unlock(&s->pe_allocation_lock);
+
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
 }
 
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ *
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
+ */
+static struct dm_snap_pending_exception *
+__find_pending_exception(struct dm_snapshot *s,
+			 struct dm_snap_pending_exception *pe, chunk_t chunk)
+{
+	struct dm_snap_pending_exception *pe2;
+
+	pe2 = __lookup_pending_exception(s, chunk);
+	if (pe2) {
+		free_pending_exception(pe);
+		return pe2;
+	}
+
+	return __insert_pending_exception(s, pe, chunk);
+}
+
 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
 			    struct bio *bio, chunk_t chunk)
 {
-	bio->bi_bdev = s->cow->bdev;
-	bio->bi_sector = chunk_to_sector(s->store,
-					 dm_chunk_number(e->new_chunk) +
-					 (chunk - e->old_chunk)) +
-					 (bio->bi_sector &
-					  s->store->chunk_mask);
+	bio_set_dev(bio, s->cow->bdev);
+	bio->bi_iter.bi_sector =
+		chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
+				(chunk - e->old_chunk)) +
+		(bio->bi_iter.bi_sector & s->store->chunk_mask);
+}
+
+static void zero_callback(int read_err, unsigned long write_err, void *context)
+{
+	struct bio *bio = context;
+	struct dm_snapshot *s = bio->bi_private;
+
+	account_end_copy(s);
+	bio->bi_status = write_err ? BLK_STS_IOERR : 0;
+	bio_endio(bio);
+}
+
+static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
+			   struct bio *bio, chunk_t chunk)
+{
+	struct dm_io_region dest;
+
+	dest.bdev = s->cow->bdev;
+	dest.sector = bio->bi_iter.bi_sector;
+	dest.count = s->store->chunk_size;
+
+	account_start_copy(s);
+	WARN_ON_ONCE(bio->bi_private);
+	bio->bi_private = s;
+	dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
+}
+
+static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
+{
+	return bio->bi_iter.bi_size ==
+		(s->store->chunk_size << SECTOR_SHIFT);
 }
 
 static int snapshot_map(struct dm_target *ti, struct bio *bio)
@@ -1577,54 +1952,88 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
+	struct dm_exception_table_lock lock;
 
 	init_tracked_chunk(bio);
 
-	if (bio->bi_rw & REQ_FLUSH) {
-		bio->bi_bdev = s->cow->bdev;
+	if (bio->bi_opf & REQ_PREFLUSH) {
+		bio_set_dev(bio, s->cow->bdev);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	chunk = sector_to_chunk(s->store, bio->bi_sector);
+	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
+	dm_exception_table_lock_init(s, chunk, &lock);
 
 	/* Full snapshots are not usable */
 	/* To get here the table must be live so s->active is always set. */
 	if (!s->valid)
-		return -EIO;
+		return DM_MAPIO_KILL;
 
-	/* FIXME: should only take write lock if we need
-	 * to copy an exception */
-	down_write(&s->lock);
+	if (bio_data_dir(bio) == WRITE) {
+		while (unlikely(!wait_for_in_progress(s, false)))
+			; /* wait_for_in_progress() has slept */
+	}
 
-	if (!s->valid) {
-		r = -EIO;
+	down_read(&s->lock);
+	dm_exception_table_lock(&lock);
+
+	if (!s->valid || (unlikely(s->snapshot_overflowed) &&
+	    bio_data_dir(bio) == WRITE)) {
+		r = DM_MAPIO_KILL;
 		goto out_unlock;
 	}
 
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+		if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
+			/*
+			 * passdown discard to origin (without triggering
+			 * snapshot exceptions via do_origin; doing so would
+			 * defeat the goal of freeing space in origin that is
+			 * implied by the "discard_passdown_origin" feature)
+			 */
+			bio_set_dev(bio, s->origin->bdev);
+			track_chunk(s, bio, chunk);
+			goto out_unlock;
+		}
+		/* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
+	}
+
 	/* If the block is already remapped - use that, else remap it */
 	e = dm_lookup_exception(&s->complete, chunk);
 	if (e) {
 		remap_exception(s, e, bio, chunk);
+		if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
+		    io_overlaps_chunk(s, bio)) {
+			dm_exception_table_unlock(&lock);
+			up_read(&s->lock);
+			zero_exception(s, e, bio, chunk);
+			r = DM_MAPIO_SUBMITTED; /* discard is not issued */
+			goto out;
+		}
+		goto out_unlock;
+	}
+
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+		/*
+		 * If no exception exists, complete discard immediately
+		 * otherwise it'll trigger copy-out.
+		 */
+		bio_endio(bio);
+		r = DM_MAPIO_SUBMITTED;
 		goto out_unlock;
 	}
 
 	/*
 	 * Write to snapshot - higher level takes care of RW/RO
 	 * flags so we should only get this if we are
-	 * writeable.
+	 * writable.
 	 */
-	if (bio_rw(bio) == WRITE) {
+	if (bio_data_dir(bio) == WRITE) {
 		pe = __lookup_pending_exception(s, chunk);
 		if (!pe) {
-			up_write(&s->lock);
+			dm_exception_table_unlock(&lock);
 			pe = alloc_pending_exception(s);
-			down_write(&s->lock);
-
-			if (!s->valid) {
-				free_pending_exception(pe);
-				r = -EIO;
-				goto out_unlock;
-			}
+			dm_exception_table_lock(&lock);
 
 			e = dm_lookup_exception(&s->complete, chunk);
 			if (e) {
@@ -1635,9 +2044,22 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
 			pe = __find_pending_exception(s, pe, chunk);
 			if (!pe) {
-				__invalidate_snapshot(s, -ENOMEM);
-				r = -EIO;
-				goto out_unlock;
+				dm_exception_table_unlock(&lock);
+				up_read(&s->lock);
+
+				down_write(&s->lock);
+
+				if (s->store->userspace_supports_overflow) {
+					if (s->valid && !s->snapshot_overflowed) {
+						s->snapshot_overflowed = 1;
+						DMERR("Snapshot overflowed: Unable to allocate exception.");
+					}
+				} else
+					__invalidate_snapshot(s, -ENOMEM);
+				up_write(&s->lock);
+
+				r = DM_MAPIO_KILL;
+				goto out;
 			}
 		}
 
@@ -1645,10 +2067,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
 		r = DM_MAPIO_SUBMITTED;
 
-		if (!pe->started &&
-		    bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+		if (!pe->started && io_overlaps_chunk(s, bio)) {
 			pe->started = 1;
-			up_write(&s->lock);
+
+			dm_exception_table_unlock(&lock);
+			up_read(&s->lock);
+
 			start_full_bio(pe, bio);
 			goto out;
 		}
 
@@ -1656,19 +2080,23 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 		bio_list_add(&pe->snapshot_bios, bio);
 
 		if (!pe->started) {
-			/* this is protected by snap->lock */
+			/* this is protected by the exception table lock */
 			pe->started = 1;
-			up_write(&s->lock);
+
+			dm_exception_table_unlock(&lock);
+			up_read(&s->lock);
+
 			start_copy(pe);
 			goto out;
 		}
 	} else {
-		bio->bi_bdev = s->origin->bdev;
+		bio_set_dev(bio, s->origin->bdev);
 		track_chunk(s, bio, chunk);
 	}
 
 out_unlock:
-	up_write(&s->lock);
+	dm_exception_table_unlock(&lock);
+	up_read(&s->lock);
 out:
 	return r;
 }
 
@@ -1694,15 +2122,21 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 
 	init_tracked_chunk(bio);
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		if (!dm_bio_get_target_bio_nr(bio))
-			bio->bi_bdev = s->origin->bdev;
+			bio_set_dev(bio, s->origin->bdev);
 		else
-			bio->bi_bdev = s->cow->bdev;
+			bio_set_dev(bio, s->cow->bdev);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	chunk = sector_to_chunk(s->store, bio->bi_sector);
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+		/* Once merging, discards no longer effect change */
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
 
 	down_write(&s->lock);
 
@@ -1714,11 +2148,11 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 	e = dm_lookup_exception(&s->complete, chunk);
 	if (e) {
 		/* Queue writes overlapping with chunks being merged */
-		if (bio_rw(bio) == WRITE &&
+		if (bio_data_dir(bio) == WRITE &&
 		    chunk >= s->first_merging_chunk &&
 		    chunk < (s->first_merging_chunk +
 			     s->num_merging_chunks)) {
-			bio->bi_bdev = s->origin->bdev;
+			bio_set_dev(bio, s->origin->bdev);
 			bio_list_add(&s->bios_queued_during_merge, bio);
 			r = DM_MAPIO_SUBMITTED;
 			goto out_unlock;
 
@@ -1726,17 +2160,17 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 
 		remap_exception(s, e, bio, chunk);
 
-		if (bio_rw(bio) == WRITE)
+		if (bio_data_dir(bio) == WRITE)
 			track_chunk(s, bio, chunk);
 		goto out_unlock;
 	}
 
redirect_to_origin:
-	bio->bi_bdev = s->origin->bdev;
+	bio_set_dev(bio, s->origin->bdev);
 
-	if (bio_rw(bio) == WRITE) {
+	if (bio_data_dir(bio) == WRITE) {
 		up_write(&s->lock);
-		return do_origin(s->origin, bio);
+		return do_origin(s->origin, bio, false);
 	}
 
 out_unlock:
@@ -1745,14 +2179,15 @@ out_unlock:
 	return r;
 }
 
-static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
+			   blk_status_t *error)
 {
 	struct dm_snapshot *s = ti->private;
 
 	if (is_bio_tracked(bio))
 		stop_tracking_chunk(s, bio);
 
-	return 0;
+	return DM_ENDIO_DONE;
 }
 
 static void snapshot_merge_presuspend(struct dm_target *ti)
 
@@ -1773,12 +2208,10 @@ static int snapshot_preresume(struct dm_target *ti)
 	if (snap_src && snap_dest) {
 		down_read(&snap_src->lock);
 		if (s == snap_src) {
-			DMERR("Unable to resume snapshot source until "
-			      "handover completes.");
+			DMERR("Unable to resume snapshot source until handover completes.");
 			r = -EINVAL;
 		} else if (!dm_suspended(snap_src->ti)) {
-			DMERR("Unable to perform snapshot handover until "
-			      "source is suspended.");
+			DMERR("Unable to perform snapshot handover until source is suspended.");
 			r = -EINVAL;
 		}
 		up_read(&snap_src->lock);
 
@@ -1791,9 +2224,40 @@ static int snapshot_preresume(struct dm_target *ti)
 static void snapshot_resume(struct dm_target *ti)
 {
 	struct dm_snapshot *s = ti->private;
-	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
+	struct dm_origin *o;
+	struct mapped_device *origin_md = NULL;
+	bool must_restart_merging = false;
+
+	down_read(&_origins_lock);
+
+	o = __lookup_dm_origin(s->origin->bdev);
+	if (o)
+		origin_md = dm_table_get_md(o->ti->table);
+	if (!origin_md) {
+		(void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
+		if (snap_merging)
+			origin_md = dm_table_get_md(snap_merging->ti->table);
+	}
+	if (origin_md == dm_table_get_md(ti->table))
+		origin_md = NULL;
+	if (origin_md) {
+		if (dm_hold(origin_md))
+			origin_md = NULL;
+	}
+
+	up_read(&_origins_lock);
+
+	if (origin_md) {
+		dm_internal_suspend_fast(origin_md);
+		if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
+			must_restart_merging = true;
+			stop_merge(snap_merging);
+		}
+	}
 
 	down_read(&_origins_lock);
+
 	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
 	if (snap_src && snap_dest) {
 		down_write(&snap_src->lock);
 
@@ -1802,8 +2266,16 @@ static void snapshot_resume(struct dm_target *ti)
 		up_write(&snap_dest->lock);
 		up_write(&snap_src->lock);
 	}
+
 	up_read(&_origins_lock);
 
+	if (origin_md) {
+		if (must_restart_merging)
+			start_merge(snap_merging);
+		dm_internal_resume_fast(origin_md);
+		dm_put(origin_md);
+	}
+
 	/* Now we have correct chunk size, reregister */
 	reregister_snapshot(s);
 
@@ -1841,10 +2313,11 @@ static void snapshot_merge_resume(struct dm_target *ti)
 }
 
 static void snapshot_status(struct dm_target *ti, status_type_t type,
-			    unsigned status_flags, char *result, unsigned maxlen)
+			    unsigned int status_flags, char *result, unsigned int maxlen)
 {
-	unsigned sz = 0;
+	unsigned int sz = 0;
 	struct dm_snapshot *snap = ti->private;
+	unsigned int num_features;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
 
@@ -1855,6 +2328,8 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
 			DMEMIT("Invalid");
 		else if (snap->merge_failed)
 			DMEMIT("Merge failed");
+		else if (snap->snapshot_overflowed)
+			DMEMIT("Overflow");
 		else {
 			if (snap->store->type->usage) {
 				sector_t total_sectors, sectors_allocated,
 
@@ -1867,8 +2342,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
 				       (unsigned long long)sectors_allocated,
 				       (unsigned long long)total_sectors,
 				       (unsigned long long)metadata_sectors);
-			}
-			else
+			} else
 				DMEMIT("Unknown");
 		}
 
@@ -1883,8 +2357,26 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
 		 * make sense.
 		 */
 		DMEMIT("%s %s", snap->origin->name, snap->cow->name);
-		snap->store->type->status(snap->store, type, result + sz,
-					  maxlen - sz);
+		sz += snap->store->type->status(snap->store, type, result + sz,
+						maxlen - sz);
+		num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
+		if (num_features) {
+			DMEMIT(" %u", num_features);
+			if (snap->discard_zeroes_cow)
+				DMEMIT(" discard_zeroes_cow");
+			if (snap->discard_passdown_origin)
+				DMEMIT(" discard_passdown_origin");
+		}
+		break;
+
+	case STATUSTYPE_IMA:
+		DMEMIT_TARGET_NAME_VERSION(ti->type);
+		DMEMIT(",snap_origin_name=%s", snap->origin->name);
+		DMEMIT(",snap_cow_name=%s", snap->cow->name);
+		DMEMIT(",snap_valid=%c", snap->valid ? 'y' : 'n');
+		DMEMIT(",snap_merge_failed=%c", snap->merge_failed ? 'y' : 'n');
+		DMEMIT(",snapshot_overflowed=%c", snap->snapshot_overflowed ? 'y' : 'n');
+		DMEMIT(";");
 		break;
 	}
 }
 
@@ -1903,12 +2395,33 @@ static int snapshot_iterate_devices(struct dm_target *ti,
 	return r;
 }
 
+static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct dm_snapshot *snap = ti->private;
+
+	if (snap->discard_zeroes_cow) {
+		struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+		down_read(&_origins_lock);
+
+		(void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
+		if (snap_src && snap_dest)
+			snap = snap_src;
+
+		/* All discards are split on chunk_size boundary */
+		limits->discard_granularity = snap->store->chunk_size;
+		limits->max_hw_discard_sectors = snap->store->chunk_size;
+
+		up_read(&_origins_lock);
+	}
+}
 
-/*-----------------------------------------------------------------
- * Origin methods
- *---------------------------------------------------------------*/
+/*
+ *---------------------------------------------------------------
+ * Origin methods
+ *---------------------------------------------------------------
+ */
 /*
  * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
  * supplied bio was ignored. The caller may submit it immediately.
  * (No remapping actually occurs as the origin is always a direct linear
 
@@ -1924,13 +2437,14 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 	int r = DM_MAPIO_REMAPPED;
 	struct dm_snapshot *snap;
 	struct dm_exception *e;
-	struct dm_snap_pending_exception *pe;
+	struct dm_snap_pending_exception *pe, *pe2;
 	struct dm_snap_pending_exception *pe_to_start_now = NULL;
 	struct dm_snap_pending_exception *pe_to_start_last = NULL;
+	struct dm_exception_table_lock lock;
 	chunk_t chunk;
 
 	/* Do all the snapshots on this origin */
-	list_for_each_entry (snap, snapshots, list) {
+	list_for_each_entry(snap, snapshots, list) {
 		/*
 		 * Don't make new exceptions in a merging snapshot
 		 * because it has effectively been deleted
 
@@ -1938,52 +2452,59 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 		if (dm_target_is_snapshot_merge(snap->ti))
 			continue;
 
-		down_write(&snap->lock);
-
-		/* Only deal with valid and active snapshots */
-		if (!snap->valid || !snap->active)
-			goto next_snapshot;
-
 		/* Nothing to do if writing beyond end of snapshot */
 		if (sector >= dm_table_get_size(snap->ti->table))
-			goto next_snapshot;
+			continue;
 
 		/*
 		 * Remember, different snapshots can have
 		 * different chunk sizes.
 		 */
 		chunk = sector_to_chunk(snap->store, sector);
+		dm_exception_table_lock_init(snap, chunk, &lock);
 
-		/*
-		 * Check exception table to see if block
-		 * is already remapped in this snapshot
-		 * and trigger an exception if not.
-		 */
-		e = dm_lookup_exception(&snap->complete, chunk);
-		if (e)
+		down_read(&snap->lock);
+		dm_exception_table_lock(&lock);
+
+		/* Only deal with valid and active snapshots */
+		if (!snap->valid || !snap->active)
 			goto next_snapshot;
 
 		pe = __lookup_pending_exception(snap, chunk);
 		if (!pe) {
-			up_write(&snap->lock);
-			pe = alloc_pending_exception(snap);
-			down_write(&snap->lock);
-
-			if (!snap->valid) {
-				free_pending_exception(pe);
-				goto next_snapshot;
-			}
-
+			/*
+			 * Check exception table to see if block is already
+			 * remapped in this snapshot and trigger an exception
+			 * if not.
+			 */
 			e = dm_lookup_exception(&snap->complete, chunk);
-			if (e) {
-				free_pending_exception(pe);
+			if (e)
 				goto next_snapshot;
-			}
 
-			pe = __find_pending_exception(snap, pe, chunk);
-			if (!pe) {
-				__invalidate_snapshot(snap, -ENOMEM);
-				goto next_snapshot;
+			dm_exception_table_unlock(&lock);
+			pe = alloc_pending_exception(snap);
+			dm_exception_table_lock(&lock);
+
+			pe2 = __lookup_pending_exception(snap, chunk);
+
+			if (!pe2) {
+				e = dm_lookup_exception(&snap->complete, chunk);
+				if (e) {
+					free_pending_exception(pe);
+					goto next_snapshot;
+				}
+
+				pe = __insert_pending_exception(snap, pe, chunk);
+				if (!pe) {
+					dm_exception_table_unlock(&lock);
+					up_read(&snap->lock);
+
+					invalidate_snapshot(snap, -ENOMEM);
+					continue;
+				}
+			} else {
+				free_pending_exception(pe);
+				pe = pe2;
 			}
 		}
 
@@ -2010,7 +2531,8 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 		}
 
 next_snapshot:
-		up_write(&snap->lock);
+		dm_exception_table_unlock(&lock);
+		up_read(&snap->lock);
 
 		if (pe_to_start_now) {
 			start_copy(pe_to_start_now);
 
@@ -2031,15 +2553,25 @@ next_snapshot:
 
 /*
  * Called on a write from the origin driver.
 */
-static int do_origin(struct dm_dev *origin, struct bio *bio)
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
 {
 	struct origin *o;
 	int r = DM_MAPIO_REMAPPED;
 
+again:
 	down_read(&_origins_lock);
 	o = __lookup_origin(origin->bdev);
-	if (o)
-		r = __origin_write(&o->snapshots, bio->bi_sector, bio);
+	if (o) {
+		if (limit) {
+			struct dm_snapshot *s;
+
+			list_for_each_entry(s, &o->snapshots, list)
+				if (unlikely(!wait_for_in_progress(s, true)))
+					goto again;
+		}
+
+		r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
+	}
 	up_read(&_origins_lock);
 
 	return r;
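Note the shape of the pending-exception path above: look up under the bucket lock, drop the lock to call alloc_pending_exception() (mempool_alloc() may sleep), retake the lock, then look up again — another writer may have installed a pending exception for the same chunk in the meantime, in which case ours is freed and the winner's is used. A generic userspace model of that unlock/allocate/relock/recheck pattern, with illustrative names and a single mutex in place of the bucket lock:

/* Userspace model of "unlock, allocate, relock, recheck"; illustrative only. */
#include <pthread.h>
#include <stdlib.h>

struct entry {
	unsigned long key;
	struct entry *next;
};

static struct entry *table;	/* toy single-bucket table */
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *lookup(unsigned long key)
{
	struct entry *e;

	for (e = table; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

static struct entry *find_or_create(unsigned long key)
{
	struct entry *e, *new_e;

	pthread_mutex_lock(&bucket_lock);
	e = lookup(key);
	if (e)
		goto out;

	/* Allocation may sleep, so drop the bucket lock around it. */
	pthread_mutex_unlock(&bucket_lock);
	new_e = malloc(sizeof(*new_e));
	pthread_mutex_lock(&bucket_lock);

	/* Recheck: someone may have inserted the key while we slept. */
	e = lookup(key);
	if (e || !new_e) {
		free(new_e);	/* lost the race (or allocation failed) */
		goto out;
	}

	new_e->key = key;
	new_e->next = table;
	table = new_e;
	e = new_e;
out:
	pthread_mutex_unlock(&bucket_lock);
	return e;
}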
@@ -2059,7 +2591,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
 * size must be a multiple of merging_snap's chunk_size.
 */
 static int origin_write_extent(struct dm_snapshot *merging_snap,
-			       sector_t sector, unsigned size)
+			       sector_t sector, unsigned int size)
 {
 	int must_wait = 0;
 	sector_t n;
 
@@ -2092,41 +2624,67 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	int r;
-	struct dm_dev *dev;
+	struct dm_origin *o;
 
 	if (argc != 1) {
 		ti->error = "origin: incorrect number of arguments";
 		return -EINVAL;
 	}
 
-	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
+	o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
+	if (!o) {
+		ti->error = "Cannot allocate private origin structure";
+		r = -ENOMEM;
+		goto bad_alloc;
+	}
+
+	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
 	if (r) {
 		ti->error = "Cannot get target device";
-		return r;
+		goto bad_open;
 	}
 
-	ti->private = dev;
+	o->ti = ti;
+	ti->private = o;
 	ti->num_flush_bios = 1;
 
 	return 0;
+
+bad_open:
+	kfree(o);
+bad_alloc:
+	return r;
 }
 
 static void origin_dtr(struct dm_target *ti)
 {
-	struct dm_dev *dev = ti->private;
-	dm_put_device(ti, dev);
+	struct dm_origin *o = ti->private;
+
+	dm_put_device(ti, o->dev);
+	kfree(o);
 }
 
 static int origin_map(struct dm_target *ti, struct bio *bio)
 {
-	struct dm_dev *dev = ti->private;
-	bio->bi_bdev = dev->bdev;
+	struct dm_origin *o = ti->private;
+	unsigned int available_sectors;
+
+	bio_set_dev(bio, o->dev->bdev);
 
-	if (bio->bi_rw & REQ_FLUSH)
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH))
 		return DM_MAPIO_REMAPPED;
 
+	if (bio_data_dir(bio) != WRITE)
+		return DM_MAPIO_REMAPPED;
+
+	available_sectors = o->split_boundary -
+		((unsigned int)bio->bi_iter.bi_sector & (o->split_boundary - 1));
+
+	if (bio_sectors(bio) > available_sectors)
+		dm_accept_partial_bio(bio, available_sectors);
+
 	/* Only tell snapshots if this is a write */
-	return (bio_rw(bio) == WRITE) ?
-		do_origin(dev, bio) : DM_MAPIO_REMAPPED;
+	return do_origin(o->dev, bio, true);
 }
 
 /*
@@ -2135,15 +2693,28 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
 */
 static void origin_resume(struct dm_target *ti)
 {
-	struct dm_dev *dev = ti->private;
+	struct dm_origin *o = ti->private;
+
+	o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
 
-	ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
+	down_write(&_origins_lock);
+	__insert_dm_origin(o);
+	up_write(&_origins_lock);
+}
+
+static void origin_postsuspend(struct dm_target *ti)
+{
+	struct dm_origin *o = ti->private;
+
+	down_write(&_origins_lock);
+	__remove_dm_origin(o);
+	up_write(&_origins_lock);
 }
 
 static void origin_status(struct dm_target *ti, status_type_t type,
-			  unsigned status_flags, char *result, unsigned maxlen)
+			  unsigned int status_flags, char *result, unsigned int maxlen)
 {
-	struct dm_dev *dev = ti->private;
+	struct dm_origin *o = ti->private;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
 
@@ -2151,49 +2722,38 @@ static void origin_status(struct dm_target *ti, status_type_t type,
 		break;
 
 	case STATUSTYPE_TABLE:
-		snprintf(result, maxlen, "%s", dev->name);
+		snprintf(result, maxlen, "%s", o->dev->name);
+		break;
+
+	case STATUSTYPE_IMA:
+		result[0] = '\0';
 		break;
 	}
 }
 
-static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-			struct bio_vec *biovec, int max_size)
-{
-	struct dm_dev *dev = ti->private;
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = dev->bdev;
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int origin_iterate_devices(struct dm_target *ti,
 				  iterate_devices_callout_fn fn, void *data)
 {
-	struct dm_dev *dev = ti->private;
+	struct dm_origin *o = ti->private;
 
-	return fn(ti, dev, 0, ti->len, data);
+	return fn(ti, o->dev, 0, ti->len, data);
 }
 
 static struct target_type origin_target = {
 	.name    = "snapshot-origin",
-	.version = {1, 8, 1},
+	.version = {1, 9, 0},
 	.module  = THIS_MODULE,
 	.ctr     = origin_ctr,
 	.dtr     = origin_dtr,
 	.map     = origin_map,
 	.resume  = origin_resume,
+	.postsuspend = origin_postsuspend,
 	.status  = origin_status,
-	.merge	 = origin_merge,
 	.iterate_devices = origin_iterate_devices,
 };
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 11, 1},
+	.version = {1, 16, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
 
@@ -2203,11 +2763,12 @@ static struct target_type snapshot_target = {
 	.resume  = snapshot_resume,
 	.status  = snapshot_status,
 	.iterate_devices = snapshot_iterate_devices,
+	.io_hints = snapshot_io_hints,
 };
 
 static struct target_type merge_target = {
 	.name    = dm_snapshot_merge_target_name,
-	.version = {1, 2, 0},
+	.version = {1, 5, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
 
@@ -2218,6 +2779,7 @@ static struct target_type merge_target = {
 	.resume  = snapshot_merge_resume,
 	.status  = snapshot_status,
 	.iterate_devices = snapshot_iterate_devices,
+	.io_hints = snapshot_io_hints,
 };
 
 static int __init dm_snapshot_init(void)
 
@@ -2230,24 +2792,6 @@ static int __init dm_snapshot_init(void)
 		return r;
 	}
 
-	r = dm_register_target(&snapshot_target);
-	if (r < 0) {
-		DMERR("snapshot target register failed %d", r);
-		goto bad_register_snapshot_target;
-	}
-
-	r = dm_register_target(&origin_target);
-	if (r < 0) {
-		DMERR("Origin target register failed %d", r);
-		goto bad_register_origin_target;
-	}
-
-	r = dm_register_target(&merge_target);
-	if (r < 0) {
-		DMERR("Merge target register failed %d", r);
-		goto
 bad_register_merge_target;
-	}
-
 	r = init_origin_hash();
 	if (r) {
 		DMERR("init_origin_hash failed.");
 
@@ -2268,19 +2812,31 @@ static int __init dm_snapshot_init(void)
 		goto bad_pending_cache;
 	}
 
+	r = dm_register_target(&snapshot_target);
+	if (r < 0)
+		goto bad_register_snapshot_target;
+
+	r = dm_register_target(&origin_target);
+	if (r < 0)
+		goto bad_register_origin_target;
+
+	r = dm_register_target(&merge_target);
+	if (r < 0)
+		goto bad_register_merge_target;
+
 	return 0;
 
-bad_pending_cache:
-	kmem_cache_destroy(exception_cache);
-bad_exception_cache:
-	exit_origin_hash();
-bad_origin_hash:
-	dm_unregister_target(&merge_target);
 bad_register_merge_target:
 	dm_unregister_target(&origin_target);
 bad_register_origin_target:
 	dm_unregister_target(&snapshot_target);
 bad_register_snapshot_target:
+	kmem_cache_destroy(pending_cache);
+bad_pending_cache:
+	kmem_cache_destroy(exception_cache);
+bad_exception_cache:
+	exit_origin_hash();
+bad_origin_hash:
 	dm_exception_store_exit();
 
 	return r;
