Diffstat (limited to 'drivers/md')
125 files changed, 11663 insertions, 3015 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 1e9db8e4acdf..104aa5355090 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -37,6 +37,32 @@ config BLK_DEV_MD If unsure, say N. +config MD_BITMAP + bool "MD RAID bitmap support" + default y + depends on BLK_DEV_MD + help + If you say Y here, support for the write intent bitmap will be + enabled. The bitmap can be used to optimize resync speed after power + failure or readding a disk, limiting it to recorded dirty sectors in + bitmap. + + This feature can be added to existing MD array or MD array can be + created with bitmap via mdadm(8). + + If unsure, say Y. + +config MD_LLBITMAP + bool "MD RAID lockless bitmap support" + depends on BLK_DEV_MD + help + If you say Y here, support for the lockless write intent bitmap will + be enabled. + + Note, this is an experimental feature. + + If unsure, say N. + config MD_AUTODETECT bool "Autodetect RAID arrays during kernel boot" depends on BLK_DEV_MD=y @@ -54,6 +80,7 @@ config MD_AUTODETECT config MD_BITMAP_FILE bool "MD bitmap file support (deprecated)" default y + depends on MD_BITMAP help If you say Y here, support for write intent bitmaps in files on an external file system is enabled. This is an alternative to the internal @@ -61,6 +88,19 @@ config MD_BITMAP_FILE various kernel APIs and can only work with files on a file system not actually sitting on the MD device. +config MD_LINEAR + tristate "Linear (append) mode" + depends on BLK_DEV_MD + help + If you say Y here, then your multiple devices driver will be able to + use the so-called linear mode, i.e. it will combine the hard disk + partitions by simply appending one to the other. + + To compile this as a module, choose M here: the module + will be called linear. + + If unsure, say Y. + config MD_RAID0 tristate "RAID-0 (striping) mode" depends on BLK_DEV_MD @@ -126,7 +166,7 @@ config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD select RAID6_PQ - select LIBCRC32C + select CRC32 select ASYNC_MEMCPY select ASYNC_XOR select ASYNC_PQ @@ -161,6 +201,7 @@ config MD_RAID456 config MD_CLUSTER tristate "Cluster Support for MD" + select MD_BITMAP depends on BLK_DEV_MD depends on DLM default n @@ -254,6 +295,7 @@ config DM_CRYPT depends on BLK_DEV_DM depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) + select CRC32 select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV @@ -379,6 +421,7 @@ config DM_RAID select MD_RAID1 select MD_RAID10 select MD_RAID456 + select MD_BITMAP select BLK_DEV_MD help A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings @@ -645,4 +688,6 @@ config DM_AUDIT source "drivers/md/dm-vdo/Kconfig" +source "drivers/md/dm-pcache/Kconfig" + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 476a214e4bdc..c338cc6fbe2e 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -27,14 +27,18 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o -md-mod-y += md.o md-bitmap.o +md-mod-y += md.o +md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o +md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o +linear-y += md-linear.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise # themselves, and md.o may use the personalities when it # auto-initialised. 
+obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o @@ -69,6 +73,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_VDO) += dm-vdo/ +obj-$(CONFIG_DM_PCACHE) += dm-pcache/ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_EBS) += dm-ebs.o diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index d4697e79d5a3..b2d10063d35f 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,7 +5,6 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES - select MIN_HEAP help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8998e61efa40..7708d92df23e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -24,21 +24,18 @@ * Since the gens and priorities are all stored contiguously on disk, we can * batch this up: We fill up the free_inc list with freshly invalidated buckets, * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. + * free_inc list. * * free_inc isn't the only freelist - if it was, we'd often to sleep while * priorities and gens were being written before we could allocate. c->free is a * smaller freelist, and buckets on that list are always ready to be used. * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * * There is another freelist, because sometimes we have buckets that we know * have nothing pointing into them - these we can reuse without waiting for * priorities to be rewritten. These come from freed btree nodes and buckets * that garbage collection discovered no longer had valid keys pointing into * them (because they were overwritten). That's the unused list - buckets on the - * unused list move to the free list, optionally being discarded in the process. + * unused list move to the free list. * * It's also important to ensure that gens don't wrap around - with respect to * either the oldest gen in the btree or the gen on disk. This is quite @@ -118,8 +115,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) /* * Background allocation thread: scans for buckets to be invalidated, * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. + * then puts them on the various freelists. */ static inline bool can_inc_bucket_gen(struct bucket *b) @@ -164,61 +160,40 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) * prio is worth 1/8th of what INITIAL_PRIO is worth. 
*/ -static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) -{ - unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; - - return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); -} - -static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; - - return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs); -} - -static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; +#define bucket_prio(b) \ +({ \ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) - return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); -} +#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) +#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; - const struct min_heap_callbacks bucket_max_cmp_callback = { - .less = new_bucket_max_cmp, - .swp = NULL, - }; - const struct min_heap_callbacks bucket_min_cmp_callback = { - .less = new_bucket_min_cmp, - .swp = NULL, - }; + ssize_t i; - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (!bch_can_invalidate_bucket(ca, b)) continue; - if (!min_heap_full(&ca->heap)) - min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca); - else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) { + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca); + heap_sift(&ca->heap, 0, bucket_max_cmp); } } - min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca); + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); while (!fifo_full(&ca->free_inc)) { - if (!ca->heap.nr) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { /* * We don't want to be calling invalidate_buckets() * multiple times when it can't do anything @@ -227,8 +202,6 @@ static void invalidate_buckets_lru(struct cache *ca) wake_up_gc(ca->set); return; } - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca); bch_invalidate_one_bucket(ca, b); } @@ -344,8 +317,7 @@ static int bch_allocator_thread(void *arg) while (1) { /* * First, we pull buckets off of the unused and free_inc lists, - * possibly issue discards to them, then we add the bucket to - * the free list: + * then we add the bucket to the free list: */ while (1) { long bucket; @@ -353,14 +325,6 @@ static int bch_allocator_thread(void *arg) if (!fifo_pop(&ca->free_inc, bucket)) break; - if (ca->discard) { - mutex_unlock(&ca->set->bucket_lock); - blkdev_issue_discard(ca->bdev, - bucket_to_sector(ca->set, bucket), - ca->sb.bucket_size, GFP_KERNEL); - mutex_lock(&ca->set->bucket_lock); - } - allocator_wait(ca, bch_allocator_push(ca, bucket)); wake_up(&ca->set->btree_cache_wait); wake_up(&ca->set->bucket_wait); @@ -435,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) TASK_UNINTERRUPTIBLE); mutex_unlock(&ca->set->bucket_lock); + + atomic_inc(&ca->set->bucket_wait_cnt); schedule(); + atomic_dec(&ca->set->bucket_wait_cnt); + mutex_lock(&ca->set->bucket_lock); } while 
(!fifo_pop(&ca->free[RESERVE_NONE], r) && !fifo_pop(&ca->free[reserve], r)); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 785b0d9008fa..8ccacba85547 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -447,8 +447,7 @@ struct cache { * free_inc: Incoming buckets - these are buckets that currently have * cached data in them, and we can't reuse them until after we write * their new gen to disk. After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) + * gens/prios, they'll be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); @@ -458,7 +457,7 @@ struct cache { /* Allocation stuff: */ struct bucket *buckets; - DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap; + DECLARE_HEAP(struct bucket *, heap); /* * If nonzero, we know we aren't going to find any buckets to invalidate @@ -467,8 +466,6 @@ struct cache { */ unsigned int invalidate_needs_gc; - bool discard; /* Get rid of? */ - struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -607,6 +604,7 @@ struct cache_set { */ atomic_t prio_blocked; wait_queue_head_t bucket_wait; + atomic_t bucket_wait_cnt; /* * For any bio we don't skip we subtract the number of sectors from diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 68258a16e125..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,11 +54,9 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - if (b->ops->is_extents) for_each_key(b, k, &iter) ret += KEY_SIZE(k); @@ -69,11 +67,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key(b, k, &iter) { if (b->ops->is_extents) { err = "Keys out of order"; @@ -114,9 +110,9 @@ bug: static void bch_btree_iter_next_check(struct btree_iter *iter) { - struct bkey *k = iter->heap.data->k, *next = bkey_next(k); + struct bkey *k = iter->data->k, *next = bkey_next(k); - if (next < iter->heap.data->end && + if (next < iter->data->end && bkey_cmp(k, iter->b->ops->is_extents ? 
&START_KEY(next) : next) > 0) { bch_dump_bucket(iter->b); @@ -883,14 +879,12 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* * If k has preceding key, preceding_key_p will be set to address * of k's preceding key; otherwise preceding_key_p will be set @@ -901,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1083,94 +1077,79 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, /* Btree iterator */ -typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *); +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); -static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args) +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - const struct btree_iter_set *_l = l; - const struct btree_iter_set *_r = r; - - return bkey_cmp(_l->k, _r->k) <= 0; + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) { - return !iter->heap.nr; + return !iter->used; } void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end) { - const struct min_heap_callbacks callbacks = { - .less = new_btree_iter_cmp, - .swp = NULL, - }; - if (k != end) - BUG_ON(!min_heap_push(&iter->heap, - &((struct btree_iter_set) { k, end }), - &callbacks, - NULL)); + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->heap.size = ARRAY_SIZE(iter->heap.preallocated); - iter->heap.nr = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, - new_btree_iter_cmp_fn *cmp) + btree_iter_cmp_fn *cmp) { struct btree_iter_set b __maybe_unused; struct bkey *ret = NULL; - const struct min_heap_callbacks callbacks = { - .less = cmp, - 
.swp = NULL, - }; if (!btree_iter_end(iter)) { bch_btree_iter_next_check(iter); - ret = iter->heap.data->k; - iter->heap.data->k = bkey_next(iter->heap.data->k); + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); - if (iter->heap.data->k > iter->heap.data->end) { + if (iter->data->k > iter->data->end) { WARN_ONCE(1, "bset was corrupt!\n"); - iter->heap.data->k = iter->heap.data->end; + iter->data->k = iter->data->end; } - if (iter->heap.data->k == iter->heap.data->end) { - if (iter->heap.nr) { - b = min_heap_peek(&iter->heap)[0]; - min_heap_pop(&iter->heap, &callbacks, NULL); - } - } + if (iter->data->k == iter->data->end) + heap_pop(iter, b, cmp); else - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, cmp); } return ret; @@ -1178,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *bch_btree_iter_next(struct btree_iter *iter) { - return __bch_btree_iter_next(iter, new_btree_iter_cmp); + return __bch_btree_iter_next(iter, btree_iter_cmp); } @@ -1216,18 +1195,16 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; - const struct min_heap_callbacks callbacks = { - .less = b->ops->sort_cmp, - .swp = NULL, - }; /* Heapify the iterator, using our comparison function */ - min_heapify_all(&iter->heap, &callbacks, NULL); + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); while (!btree_iter_end(iter)) { if (b->ops->sort_fixup && fixup) @@ -1316,11 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1331,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1347,13 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_stack_init(b, &iter, NULL); - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index f79441acd4c1..6ee2c6a506a2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -187,9 +187,8 @@ struct bset_tree { }; struct btree_keys_ops { - bool (*sort_cmp)(const void *l, - const void *r, - void *args); + bool (*sort_cmp)(struct btree_iter_set l, + struct btree_iter_set r); struct bkey *(*sort_fixup)(struct btree_iter *iter, struct bkey *tmp); bool (*insert_fixup)(struct btree_keys *b, @@ -313,18 +312,28 @@ enum { BTREE_INSERT_STATUS_FRONT_MERGE, }; 
-struct btree_iter_set { - struct bkey *k, *end; -}; - /* Btree key iteration */ struct btree_iter { + size_t size, used; #ifdef CONFIG_BCACHE_DEBUG struct btree_keys *b; #endif - MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap; + struct btree_iter_set { + struct bkey *k, *end; + } data[]; +}; + +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + /* Must be last as it ends in a flexible-array member. */ + TRAILING_OVERLAP(struct btree_iter, iter, data, + struct btree_iter_set stack_data[MAX_BSETS]; + ); }; +static_assert(offsetof(struct btree_iter_stack, iter.data) == + offsetof(struct btree_iter_stack, stack_data)); typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -335,9 +344,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -352,13 +361,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ed40d8600656..3ed39c823826 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -36,6 +36,7 @@ #include <linux/sched/clock.h> #include <linux/rculist.h> #include <linux/delay.h> +#include <linux/sort.h> #include <trace/events/bcache.h> /* @@ -88,10 +89,9 @@ * Test module load/unload */ -#define MAX_NEED_GC 64 -#define MAX_SAVE_PRIO 72 -#define MAX_GC_TIMES 100 -#define MIN_GC_NODES 100 +#define MAX_GC_TIMES_SHIFT 7 /* 128 loops */ +#define GC_NODES_MIN 10 +#define GC_SLEEP_MS_MIN 10 #define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) @@ -149,19 +149,19 @@ void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = btree_bset_first(b); - struct btree_iter iter; + struct btree_iter *iter; /* * c->fill_iter can allocate an iterator with more memory space * than static MAX_BSETS. * See the comment arount cache_set->fill_iter. 
*/ - iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; - iter.heap.nr = 0; + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; + iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter.b = &b->keys; + iter->b = &b->keys; #endif if (!i->seq) @@ -199,7 +199,7 @@ void bch_btree_node_read_done(struct btree *b) if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(&iter, i->start, bset_bkey_last(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -211,7 +211,7 @@ void bch_btree_node_read_done(struct btree *b) if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort); + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); i = b->keys.set[0].data; err = "short btree key"; @@ -223,7 +223,7 @@ void bch_btree_node_read_done(struct btree *b) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->cache->sb)); out: - mempool_free(iter.heap.data, &b->c->fill_iter); + mempool_free(iter, &b->c->fill_iter); return; err: set_btree_node_io_error(b); @@ -372,7 +372,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) { struct bio_vec *bv; void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); struct bvec_iter_all iter_all; @@ -559,8 +559,6 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) } } -#define cmp_int(l, r) ((l > r) - (l < r)) - #ifdef CONFIG_PROVE_LOCKING static int btree_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) @@ -1309,11 +1307,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - gc->nodes++; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { @@ -1572,11 +1568,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); @@ -1585,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b) static size_t btree_gc_min_nodes(struct cache_set *c) { - size_t min_nodes; + size_t min_nodes = GC_NODES_MIN; - /* - * Since incremental GC would stop 100ms when front - * side I/O comes, so when there are many btree nodes, - * if GC only processes constant (100) nodes each time, - * GC would last a long time, and the front side I/Os - * would run out of the buckets (since no new bucket - * can be allocated during GC), and be blocked again. 
- * So GC should not process constant nodes, but varied - * nodes according to the number of btree nodes, which - * realized by dividing GC into constant(100) times, - * so when there are many btree nodes, GC can process - * more nodes each time, otherwise, GC will process less - * nodes each time (but no less than MIN_GC_NODES) - */ - min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; - if (min_nodes < MIN_GC_NODES) - min_nodes = MIN_GC_NODES; + if (atomic_read(&c->search_inflight) == 0) { + size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT; + + if (min_nodes < n) + min_nodes = n; + } return min_nodes; } +static uint64_t btree_gc_sleep_ms(struct cache_set *c) +{ + uint64_t sleep_ms; + + if (atomic_read(&c->bucket_wait_cnt) > 0) + sleep_ms = GC_SLEEP_MS_MIN; + else + sleep_ms = GC_SLEEP_MS; + + return sleep_ms; +} static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) @@ -1615,18 +1609,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1675,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; - if (atomic_read(&b->c->search_inflight) && - gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) { gc->nodes_pre = gc->nodes; ret = -EAGAIN; break; @@ -1853,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c) cond_resched(); if (ret == -EAGAIN) - schedule_timeout_interruptible(msecs_to_jiffies - (GC_SLEEP_MS)); + schedule_timeout_interruptible( + msecs_to_jiffies(btree_gc_sleep_ms(c))); else if (ret) pr_warn("gc failed!\n"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -1921,9 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1931,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1962,7 +1953,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1970,11 +1961,9 @@ static int bch_btree_check_thread(void *arg) cur_idx = prev_idx = 
0; ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1992,7 +1981,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2065,11 +2054,9 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* check and mark root node keys */ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(c, c->root->level, k); @@ -2563,12 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2597,12 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? 
fn(op, b, k) : bcache_btree(map_keys_recurse, k, @@ -2836,7 +2822,8 @@ void bch_btree_exit(void) int __init bch_btree_init(void) { - btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); + btree_io_wq = alloc_workqueue("bch_btree_io", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!btree_io_wq) return -ENOMEM; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 7510d1c983a5..f327456fc4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, - REQ_OP_READ); + bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 4b84fda1530a..d626ffcbecb9 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -33,16 +33,15 @@ static void sort_key_next(struct btree_iter *iter, i->k = bkey_next(i->k); if (i->k == i->end) - *i = iter->heap.data[--iter->heap.nr]; + *i = iter->data[--iter->used]; } -static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args) +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(_l->k, _r->k); + int64_t c = bkey_cmp(l.k, r.k); - return !(c ? c > 0 : _l->k < _r->k); + return c ? c > 0 : l.k < r.k; } static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) @@ -239,7 +238,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, } const struct btree_keys_ops bch_btree_keys_ops = { - .sort_cmp = new_bch_key_sort_cmp, + .sort_cmp = bch_key_sort_cmp, .insert_fixup = bch_btree_ptr_insert_fixup, .key_invalid = bch_btree_ptr_invalid, .key_bad = bch_btree_ptr_bad, @@ -256,28 +255,22 @@ const struct btree_keys_ops bch_btree_keys_ops = { * Necessary for btree_sort_fixup() - if there are multiple keys that compare * equal in different sets, we have to process them newest to oldest. */ - -static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args) +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k)); + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - return !(c ? c > 0 : _l->k < _r->k); + return c ? 
c > 0 : l.k < r.k; } static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { - const struct min_heap_callbacks callbacks = { - .less = new_bch_extent_sort_cmp, - .swp = NULL, - }; - while (iter->heap.nr > 1) { - struct btree_iter_set *top = iter->heap.data, *i = top + 1; - - if (iter->heap.nr > 2 && - !new_bch_extent_sort_cmp(&i[0], &i[1], NULL)) + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) i++; if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) @@ -285,7 +278,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, if (!KEY_SIZE(i->k)) { sort_key_next(iter, i); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); continue; } @@ -295,7 +288,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, else bch_cut_front(top->k, i->k); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); } else { /* can't happen because of comparison func */ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); @@ -305,7 +298,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, bch_cut_back(&START_KEY(i->k), tmp); bch_cut_front(i->k, top->k); - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, bch_extent_sort_cmp); return tmp; } else { @@ -625,7 +618,7 @@ static bool bch_extent_merge(struct btree_keys *bk, } const struct btree_keys_ops bch_extent_keys_ops = { - .sort_cmp = new_bch_extent_sort_cmp, + .sort_cmp = bch_extent_sort_cmp, .sort_fixup = bch_extent_sort_fixup, .insert_fixup = bch_extent_insert_fixup, .key_invalid = bch_extent_invalid, diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 020712c5203f..2386d08bf4e4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, - meta_bucket_pages(&c->cache->sb), 0); + bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7ff14bd2feb8..144693b7c46a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -275,8 +275,7 @@ bsearch: * ja->cur_idx */ ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + ja->last_idx = (i + 1) % ca->sb.njournal_buckets; } @@ -336,16 +335,6 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } -static bool is_discard_enabled(struct cache_set *s) -{ - struct cache *ca = s->cache; - - if (ca->discard) - return true; - - return false; -} - int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; @@ -360,15 +349,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) BUG_ON(i->pin && atomic_read(i->pin) != 1); if (n != i->j.seq) { - if (n == start && is_discard_enabled(s)) - pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - else { - pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - ret = -EIO; - goto err; - } + pr_err("journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; } for (k = i->j.start; @@ -568,65 +552,6 @@ out: #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -static void journal_discard_endio(struct bio *bio) -{ - struct journal_device *ja = - container_of(bio, struct journal_device, discard_bio); - struct cache *ca = container_of(ja, struct cache, journal); - - atomic_set(&ja->discard_in_flight, DISCARD_DONE); - - closure_wake_up(&ca->set->journal.wait); - closure_put(&ca->set->cl); -} - -static void journal_discard_work(struct work_struct *work) -{ - struct journal_device *ja = - container_of(work, struct journal_device, discard_work); - - submit_bio(&ja->discard_bio); -} - -static void do_journal_discard(struct cache *ca) -{ - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->discard_bio; - - if (!ca->discard) { - ja->discard_idx = ja->last_idx; - return; - } - - switch (atomic_read(&ja->discard_in_flight)) { - case DISCARD_IN_FLIGHT: - return; - - case DISCARD_DONE: - ja->discard_idx = (ja->discard_idx + 1) % - ca->sb.njournal_buckets; - - atomic_set(&ja->discard_in_flight, DISCARD_READY); - fallthrough; - - case DISCARD_READY: - if (ja->discard_idx == ja->last_idx) - return; - - atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); - bio->bi_iter.bi_sector = bucket_to_sector(ca->set, - ca->sb.d[ja->discard_idx]); - bio->bi_iter.bi_size = bucket_bytes(ca); - bio->bi_end_io = journal_discard_endio; - - closure_get(&ca->set->cl); - INIT_WORK(&ja->discard_work, journal_discard_work); - queue_work(bch_journal_wq, &ja->discard_work); - } -} - static unsigned int free_journal_buckets(struct cache_set *c) { struct journal *j = &c->journal; @@ -635,10 +560,10 @@ static unsigned int free_journal_buckets(struct cache_set *c) unsigned int n; /* In case njournal_buckets is not power of 2 */ - if (ja->cur_idx >= ja->discard_idx) - n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; + if (ja->cur_idx >= ja->last_idx) + n = ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx; else - n = ja->discard_idx - ja->cur_idx; + n = ja->last_idx - ja->cur_idx; if (n > (1 + j->do_reserve)) return n - (1 + j->do_reserve); @@ -668,8 +593,6 @@ static void journal_reclaim(struct cache_set *c) ja->last_idx = (ja->last_idx + 1) % ca->sb.njournal_buckets; - do_journal_discard(ca); - if (c->journal.blocks_free) goto out; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index cd316b4a1e95..9e9d1b3016a5 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -139,19 +139,6 @@ struct journal_device { /* Last journal bucket that still contains an open journal entry */ unsigned int last_idx; - /* Next journal bucket to be discarded */ - unsigned int discard_idx; - -#define DISCARD_READY 0 -#define DISCARD_IN_FLIGHT 1 -#define DISCARD_DONE 2 - /* 1 - discard in flight, -1 - discard completed */ - atomic_t discard_in_flight; - - struct work_struct discard_work; - struct bio discard_bio; - struct bio_vec discard_bv; - /* Bio for journal reads/writes to this device */ struct bio bio; struct bio_vec bv[8]; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index ef6abf33f926..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,10 +79,10 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, 
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_private = &io->cl; @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; @@ -182,19 +182,16 @@ err: if (!IS_ERR_OR_NULL(w->private)) closure_sync(&cl); } -static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args) +static bool bucket_cmp(struct bucket *l, struct bucket *r) { - struct bucket **_l = (struct bucket **)l; - struct bucket **_r = (struct bucket **)r; - - return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); } static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; - return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } void bch_moving_gc(struct cache_set *c) @@ -202,10 +199,6 @@ void bch_moving_gc(struct cache_set *c) struct cache *ca = c->cache; struct bucket *b; unsigned long sectors_to_move, reserve_sectors; - const struct min_heap_callbacks callbacks = { - .less = new_bucket_cmp, - .swp = NULL, - }; if (!c->copy_gc_enabled) return; @@ -216,7 +209,7 @@ void bch_moving_gc(struct cache_set *c) reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (GC_MARK(b) == GC_MARK_METADATA || @@ -225,31 +218,25 @@ void bch_moving_gc(struct cache_set *c) atomic_read(&b->pin)) continue; - if (!min_heap_full(&ca->heap)) { + if (!heap_full(&ca->heap)) { sectors_to_move += GC_SECTORS_USED(b); - min_heap_push(&ca->heap, &b, &callbacks, NULL); - } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) { + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { sectors_to_move -= bucket_heap_top(ca); sectors_to_move += GC_SECTORS_USED(b); ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &callbacks, NULL); + heap_sift(&ca->heap, 0, bucket_cmp); } } while (sectors_to_move > reserve_sectors) { - if (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); - } + heap_pop(&ca->heap, b, bucket_cmp); sectors_to_move -= GC_SECTORS_USED(b); } - while (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); + while (heap_pop(&ca->heap, b, bucket_cmp)) SET_GC_MOVE(b, 1); - } mutex_unlock(&c->bucket_lock); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 68b02216033d..0056106495a7 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -123,7 +123,7 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc) kobject_put(&acc->day.kobj); atomic_set(&acc->closing, 1); - if (del_timer_sync(&acc->timer)) + if (timer_delete_sync(&acc->timer)) closure_return(&acc->cl); } @@ -149,7 +149,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) static void scale_accounting(struct timer_list *t) { - struct cache_accounting *acc = from_timer(acc, t, timer); + struct cache_accounting *acc = timer_container_of(acc, t, timer); #define move_stat(name) do 
{ \ unsigned int t = atomic_xchg(&acc->collector.name, 0); \ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e7abfdd77c3b..c17d4517af22 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -168,14 +168,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, { const char *err; struct cache_sb_disk *s; - struct page *page; + struct folio *folio; unsigned int i; - page = read_cache_page_gfp(bdev->bd_mapping, - SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); - if (IS_ERR(page)) + folio = mapping_read_folio_gfp(bdev->bd_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(folio)) return "IO error"; - s = page_address(page) + offset_in_page(SB_OFFSET); + s = folio_address(folio) + offset_in_folio(folio, SB_OFFSET); sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -272,7 +272,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, *res = s; return NULL; err: - put_page(page); + folio_put(folio); return err; } @@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; bio->bi_iter.bi_sector = SB_SECTOR; - __bio_add_page(bio, virt_to_page(out), SB_SIZE, - offset_in_page(out)); + bio_add_virt_nofail(bio, out, SB_SIZE); out->offset = cpu_to_le64(sb->offset); @@ -546,7 +545,8 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) static struct uuid_entry *uuid_find_empty(struct cache_set *c) { - static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + static const char zero_uuid[16] __nonstring = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; return uuid_find(c, zero_uuid); } @@ -1366,7 +1366,7 @@ static CLOSURE_CALLBACK(cached_dev_free) mutex_unlock(&bch_register_lock); if (dc->sb_disk) - put_page(virt_to_page(dc->sb_disk)); + folio_put(virt_to_folio(dc->sb_disk)); if (dc->bdev_file) fput(dc->bdev_file); @@ -1388,7 +1388,7 @@ static CLOSURE_CALLBACK(cached_dev_flush) bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); - continue_at(cl, cached_dev_free, system_wq); + continue_at(cl, cached_dev_free, system_percpu_wq); } static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) @@ -1400,7 +1400,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) __module_get(THIS_MODULE); INIT_LIST_HEAD(&dc->list); closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_percpu_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); INIT_WORK(&dc->detach, cached_dev_detach_finish); sema_init(&dc->sb_write_mutex, 1); @@ -1513,7 +1513,7 @@ static CLOSURE_CALLBACK(flash_dev_flush) bcache_device_unlink(d); mutex_unlock(&bch_register_lock); kobject_del(&d->kobj); - continue_at(cl, flash_dev_free, system_wq); + continue_at(cl, flash_dev_free, system_percpu_wq); } static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) @@ -1525,7 +1525,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err_ret; closure_init(&d->cl, NULL); - set_closure_fn(&d->cl, flash_dev_flush, system_wq); + set_closure_fn(&d->cl, flash_dev_flush, system_percpu_wq); kobject_init(&d->kobj, &bch_flash_dev_ktype); @@ -1718,7 +1718,7 @@ static CLOSURE_CALLBACK(cache_set_flush) if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); - if (!IS_ERR(c->root)) + if (!IS_ERR_OR_NULL(c->root)) 
list_add(&c->root->list, &c->btree_cache); /* @@ -1733,7 +1733,12 @@ static CLOSURE_CALLBACK(cache_set_flush) mutex_unlock(&b->write_lock); } - if (ca->alloc_thread) + /* + * If the register_cache_set() call to bch_cache_set_alloc() failed, + * ca has not been assigned a value and return error. + * So we need check ca is not NULL during bch_cache_set_unregister(). + */ + if (ca && ca->alloc_thread) kthread_stop(ca->alloc_thread); if (c->journal.cur) { @@ -1828,7 +1833,7 @@ static CLOSURE_CALLBACK(__cache_set_unregister) mutex_unlock(&bch_register_lock); - continue_at(cl, cache_set_flush, system_wq); + continue_at(cl, cache_set_flush, system_percpu_wq); } void bch_cache_set_stop(struct cache_set *c) @@ -1858,10 +1863,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) __module_get(THIS_MODULE); closure_init(&c->cl, NULL); - set_closure_fn(&c->cl, cache_set_free, system_wq); + set_closure_fn(&c->cl, cache_set_free, system_percpu_wq); closure_init(&c->caching, &c->cl); - set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + set_closure_fn(&c->caching, __cache_set_unregister, system_percpu_wq); /* Maybe create continue_at_noreturn() and use it here? */ closure_set_stopped(&c->cl); @@ -1907,7 +1912,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); @@ -1933,7 +1939,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (!c->uuids) goto err; - c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + c->moving_gc_wq = alloc_workqueue("bcache_gc", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!c->moving_gc_wq) goto err; @@ -2210,7 +2217,7 @@ void bch_cache_release(struct kobject *kobj) free_fifo(&ca->free[i]); if (ca->sb_disk) - put_page(virt_to_page(ca->sb_disk)); + folio_put(virt_to_folio(ca->sb_disk)); if (ca->bdev_file) fput(ca->bdev_file); @@ -2230,18 +2237,50 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* - * when ca->sb.njournal_buckets is not zero, journal exists, - * and in bch_journal_replay(), tree node may split, - * so bucket of RESERVE_BTREE type is needed, - * the worst situation is all journal buckets are valid journal, - * and all the keys need to replay, - * so the number of RESERVE_BTREE type buckets should be as much - * as journal buckets + * When the cache disk is first registered, ca->sb.njournal_buckets + * is zero, and it is assigned in run_cache_set(). + * + * When ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split. + * The worst situation is all journal buckets are valid journal, + * and all the keys need to replay, so the number of RESERVE_BTREE + * type buckets should be as much as journal buckets. + * + * If the number of RESERVE_BTREE type buckets is too few, the + * bch_allocator_thread() may hang up and unable to allocate + * bucket. The situation is roughly as follows: + * + * 1. In bch_data_insert_keys(), if the operation is not op->replace, + * it will call the bch_journal(), which increments the journal_ref + * counter. 
This counter is only decremented after bch_btree_insert + * completes. + * + * 2. When calling bch_btree_insert, if the btree needs to split, + * it will call btree_split() and btree_check_reserve() to check + * whether there are enough reserved buckets in the RESERVE_BTREE + * slot. If not enough, bcache_btree_root() will repeatedly retry. + * + * 3. Normally, the bch_allocator_thread is responsible for filling + * the reservation slots from the free_inc bucket list. When the + * free_inc bucket list is exhausted, the bch_allocator_thread + * will call invalidate_buckets() until free_inc is refilled. + * Then bch_allocator_thread calls bch_prio_write() once. and + * bch_prio_write() will call bch_journal_meta() and waits for + * the journal write to complete. + * + * 4. During journal_write, journal_write_unlocked() is be called. + * If journal full occurs, journal_reclaim() and btree_flush_write() + * will be called sequentially, then retry journal_write. + * + * 5. When 2 and 4 occur together, IO will hung up and cannot recover. + * + * Therefore, reserve more RESERVE_BTREE type buckets. */ - btree_buckets = ca->sb.njournal_buckets ?: 8; + btree_buckets = clamp_t(size_t, ca->sb.nbuckets >> 7, + 32, SB_JOURNAL_BUCKETS); free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; if (!free) { ret = -EPERM; @@ -2344,9 +2383,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, ca->bdev = file_bdev(bdev_file); ca->sb_disk = sb_disk; - if (bdev_max_discard_sectors(file_bdev(bdev_file))) - ca->discard = CACHE_DISCARD(&ca->sb); - ret = cache_alloc(ca); if (ret != 0) { if (ret == -ENOMEM) @@ -2493,7 +2529,7 @@ static void register_device_async(struct async_reg_args *args) INIT_DELAYED_WORK(&args->reg_work, register_cache_worker); /* 10 jiffies is enough for a delay */ - queue_delayed_work(system_wq, &args->reg_work, 10); + queue_delayed_work(system_percpu_wq, &args->reg_work, 10); } static void *alloc_holder_object(struct cache_sb *sb) @@ -2555,7 +2591,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!holder) { ret = -ENOMEM; err = "cannot allocate memory"; - goto out_put_sb_page; + goto out_put_sb_folio; } /* Now reopen in exclusive mode with proper holder */ @@ -2629,8 +2665,8 @@ async_done: out_free_holder: kfree(holder); -out_put_sb_page: - put_page(virt_to_page(sb_disk)); +out_put_sb_folio: + folio_put(virt_to_folio(sb_disk)); out_blkdev_put: if (bdev_file) fput(bdev_file); @@ -2867,24 +2903,25 @@ static int __init bcache_init(void) if (bch_btree_init()) goto err; - bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bcache_wq) goto err; /* * Let's not make this `WQ_MEM_RECLAIM` for the following reasons: * - * 1. It used `system_wq` before which also does no memory reclaim. + * 1. It used `system_percpu_wq` before which also does no memory reclaim. * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and * reduced throughput can be observed. * - * We still want to user our own queue to not congest the `system_wq`. + * We still want to user our own queue to not congest the `system_percpu_wq`. 
*/ - bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); + bch_flush_wq = alloc_workqueue("bch_flush", WQ_PERCPU, 0); if (!bch_flush_wq) goto err; - bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); + bch_journal_wq = alloc_workqueue("bch_journal", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bch_journal_wq) goto err; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e8f696cb58c0..72f38e5b6f5c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,7 +134,6 @@ read_attribute(partial_stripes_expensive); rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(io_disable); -rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(errors); @@ -660,9 +659,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; goto lock_root; @@ -1038,7 +1035,6 @@ SHOW(__bch_cache) sysfs_hprint(bucket_size, bucket_bytes(ca)); sysfs_hprint(block_size, block_bytes(ca)); sysfs_print(nbuckets, ca->sb.nbuckets); - sysfs_print(discard, ca->discard); sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); sysfs_hprint(btree_written, atomic_long_read(&ca->btree_sectors_written) << 9); @@ -1144,18 +1140,6 @@ STORE(__bch_cache) if (bcache_is_reboot) return -EBUSY; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - if (bdev_max_discard_sectors(ca->bdev)) - ca->discard = v; - - if (v != CACHE_DISCARD(&ca->sb)) { - SET_CACHE_DISCARD(&ca->sb, v); - bcache_write_super(ca->set); - } - } - if (attr == &sysfs_cache_replacement_policy) { v = __sysfs_match_string(cache_replacement_policies, -1, buf); if (v < 0) @@ -1187,7 +1171,6 @@ static struct attribute *bch_cache_attrs[] = { &sysfs_block_size, &sysfs_nbuckets, &sysfs_priority_stats, - &sysfs_discard, &sysfs_written, &sysfs_btree_written, &sysfs_metadata_written, diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 539454d8e2d0..f61ab1bada6c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/sched/clock.h> #include <linux/llist.h> -#include <linux/min_heap.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> @@ -31,10 +30,16 @@ struct closure; #endif +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + #define init_heap(heap, _size, gfp) \ ({ \ size_t _bytes; \ - (heap)->nr = 0; \ + (heap)->used = 0; \ (heap)->size = (_size); \ _bytes = (heap)->size * sizeof(*(heap)->data); \ (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ @@ -47,6 +52,64 @@ do { \ (heap)->data = NULL; \ } while (0) +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, 
_i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c1d28e365910..4b237074f453 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -331,10 +331,10 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_private = w; @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; @@ -805,8 +805,7 @@ static int bch_writeback_thread(void *arg) * may set BCH_ENABLE_AUTO_GC via sysfs, then when * BCH_DO_AUTO_GC is set, garbage collection thread * will be wake up here. After moving gc, the shrunk - * btree and discarded free buckets SSD space may be - * helpful for following write requests. + * btree may be helpful for following write requests. 
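The DECLARE_HEAP()/heap_add()/heap_pop() macros restored in util.h above implement a plain array-backed binary heap; the comparator cmp(a, b) returns true when a belongs below b, so cmp(l, r) == (l < r) yields a max-heap. Below is a hedged userspace rendition of the same behaviour, written as functions over int rather than as the kernel macros, with malloc() standing in for kvmalloc().

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct int_heap {
        size_t size, used;
        int *data;
};

/* comparator convention of the bcache macros: true means "first stays below second" */
static bool max_cmp(int l, int r) { return l < r; }

static void sift_down(struct int_heap *h, size_t j)
{
        for (size_t r; j * 2 + 1 < h->used; j = r) {
                r = j * 2 + 1;
                if (r + 1 < h->used && max_cmp(h->data[r], h->data[r + 1]))
                        r++;                    /* pick the larger child */
                if (max_cmp(h->data[r], h->data[j]))
                        break;                  /* child already belongs below parent */
                int t = h->data[r]; h->data[r] = h->data[j]; h->data[j] = t;
        }
}

static bool heap_add(struct int_heap *h, int d)
{
        if (h->used == h->size)
                return false;
        size_t i = h->used++;
        h->data[i] = d;
        while (i) {                             /* sift the new element up */
                size_t p = (i - 1) / 2;
                if (max_cmp(h->data[i], h->data[p]))
                        break;
                int t = h->data[i]; h->data[i] = h->data[p]; h->data[p] = t;
                i = p;
        }
        return true;
}

static bool heap_pop(struct int_heap *h, int *d)
{
        if (!h->used)
                return false;
        *d = h->data[0];
        h->data[0] = h->data[--h->used];        /* last element replaces the root */
        sift_down(h, 0);
        return true;
}

int main(void)
{
        struct int_heap h = { .size = 8, .used = 0, .data = malloc(8 * sizeof(int)) };
        int vals[] = { 3, 9, 1, 7, 5 }, v;

        if (!h.data)
                return 1;
        for (int i = 0; i < 5; i++)
                heap_add(&h, vals[i]);
        while (heap_pop(&h, &v))
                printf("%d ", v);               /* prints 9 7 5 3 1 */
        printf("\n");
        free(h.data);
        return 0;
}

heap_pop() here moves the last element to the root and sifts it down, matching what the macro version does by swapping slot 0 with the slot just past the shrunk heap.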
*/ if (c->gc_after_writeback == (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { @@ -908,16 +907,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -931,7 +929,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -980,13 +978,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - retry_lock: b = c->root; rw_lock(0, b, b->level); @@ -1079,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!dc->writeback_write_wq) return -ENOMEM; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index aab8240429b0..e6d28be11c5c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -41,16 +41,6 @@ #define DM_BUFIO_LOW_WATERMARK_RATIO 16 /* - * Check buffer ages in this interval (seconds) - */ -#define DM_BUFIO_WORK_TIMER_SECS 30 - -/* - * Free buffers when they are older than this (seconds) - */ -#define DM_BUFIO_DEFAULT_AGE_SECS 300 - -/* * The nr of bytes of cached data to keep around. 
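The read_dirty() hunk in the bcache writeback changes above now spells out the allocation size as sizeof(*io) plus a trailing array of bio_vec, since struct bio no longer exposes an inline-vector member suitable for struct_size(). The shape of that allocation is the usual flexible-array pattern; here is a small userspace sketch with simplified placeholder types (dirty_io and bio_vec below are not the kernel structures).

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SECTORS            8       /* assumed: 4 KiB pages, 512-byte sectors */
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* simplified placeholders for the kernel structures */
struct bio_vec {
        void *bv_page;
        unsigned int bv_len;
        unsigned int bv_offset;
};

struct dirty_io {
        unsigned long key_sectors;
        /* inline vectors immediately follow the struct in memory */
        struct bio_vec vecs[];
};

int main(void)
{
        unsigned long key_sectors = 37;         /* hypothetical extent size in sectors */
        unsigned int nr_vecs = DIV_ROUND_UP(key_sectors, PAGE_SECTORS);

        /* one allocation covers both the header and the vector array */
        struct dirty_io *io = calloc(1, sizeof(*io) +
                                        sizeof(struct bio_vec) * nr_vecs);
        if (!io)
                return 1;

        io->key_sectors = key_sectors;
        io->vecs[nr_vecs - 1] = (struct bio_vec){ 0 };

        printf("%lu sectors -> %u inline vecs, %zu bytes total\n",
               key_sectors, nr_vecs,
               sizeof(*io) + sizeof(struct bio_vec) * nr_vecs);

        free(io);
        return 0;
}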
*/ #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) @@ -68,6 +58,8 @@ #define LIST_DIRTY 1 #define LIST_SIZE 2 +#define SCAN_RESCHED_CYCLE 16 + /*--------------------------------------------------------------*/ /* @@ -1055,10 +1047,8 @@ static unsigned long dm_bufio_cache_size_latch; static DEFINE_SPINLOCK(global_spinlock); -/* - * Buffers are freed after this timeout - */ -static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; +static unsigned int dm_bufio_max_age; /* No longer does anything */ + static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; @@ -1086,7 +1076,6 @@ static LIST_HEAD(dm_bufio_all_clients); static DEFINE_MUTEX(dm_bufio_clients_lock); static struct workqueue_struct *dm_bufio_wq; -static struct delayed_work dm_bufio_cleanup_old_work; static struct work_struct dm_bufio_replacement_work; @@ -1348,12 +1337,12 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, char *ptr; unsigned int len; - bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); + bio = bio_kmalloc(1, GFP_NOWAIT); if (!bio) { use_dmio(b, op, sector, n_sectors, offset, ioprio); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); + bio_init_inline(bio, b->c->bdev, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; @@ -1362,7 +1351,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, ptr = (char *)b->data + offset; len = n_sectors << SECTOR_SHIFT; - __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr)); + bio_add_virt_nofail(bio, ptr, len); submit_bio(bio); } @@ -1612,18 +1601,18 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client * dm-bufio is resistant to allocation failures (it just keeps * one buffer reserved in cases all the allocations fail). * So set flags to not try too hard: - * GFP_NOWAIT: don't wait; if we need to sleep we'll release our - * mutex and wait ourselves. + * GFP_NOWAIT: don't wait and don't print a warning in case of + * failure; if we need to sleep we'll release our mutex + * and wait ourselves. * __GFP_NORETRY: don't retry and rather return failure * __GFP_NOMEMALLOC: don't use emergency reserves - * __GFP_NOWARN: don't print a warning in case of failure * * For debugging, if we set the cache size to 1, no new buffers will * be allocated. */ while (1) { if (dm_bufio_cache_size_latch != 1) { - b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC); if (b) return b; } @@ -2234,7 +2223,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -static bool forget_buffer(struct dm_bufio_client *c, sector_t block) +static void forget_buffer(struct dm_bufio_client *c, sector_t block) { struct dm_buffer *b; @@ -2249,8 +2238,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block) cache_put_and_wake(c, b); } } - - return b ? 
true : false; } /* @@ -2426,7 +2413,12 @@ static void __scan(struct dm_bufio_client *c) atomic_long_dec(&c->need_shrink); freed++; - cond_resched(); + + if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) { + dm_bufio_unlock(c); + cond_resched(); + dm_bufio_lock(c); + } } } } @@ -2675,130 +2667,6 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); /*--------------------------------------------------------------*/ -static unsigned int get_max_age_hz(void) -{ - unsigned int max_age = READ_ONCE(dm_bufio_max_age); - - if (max_age > UINT_MAX / HZ) - max_age = UINT_MAX / HZ; - - return max_age * HZ; -} - -static bool older_than(struct dm_buffer *b, unsigned long age_hz) -{ - return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz); -} - -struct evict_params { - gfp_t gfp; - unsigned long age_hz; - - /* - * This gets updated with the largest last_accessed (ie. most - * recently used) of the evicted buffers. It will not be reinitialised - * by __evict_many(), so you can use it across multiple invocations. - */ - unsigned long last_accessed; -}; - -/* - * We may not be able to evict this buffer if IO pending or the client - * is still using it. - * - * And if GFP_NOFS is used, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets - * rerouted to different bufio client. - */ -static enum evict_result select_for_evict(struct dm_buffer *b, void *context) -{ - struct evict_params *params = context; - - if (!(params->gfp & __GFP_FS) || - (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { - if (test_bit_acquire(B_READING, &b->state) || - test_bit(B_WRITING, &b->state) || - test_bit(B_DIRTY, &b->state)) - return ER_DONT_EVICT; - } - - return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP; -} - -static unsigned long __evict_many(struct dm_bufio_client *c, - struct evict_params *params, - int list_mode, unsigned long max_count) -{ - unsigned long count; - unsigned long last_accessed; - struct dm_buffer *b; - - for (count = 0; count < max_count; count++) { - b = cache_evict(&c->cache, list_mode, select_for_evict, params); - if (!b) - break; - - last_accessed = READ_ONCE(b->last_accessed); - if (time_after_eq(params->last_accessed, last_accessed)) - params->last_accessed = last_accessed; - - __make_buffer_clean(b); - __free_buffer_wake(b); - - cond_resched(); - } - - return count; -} - -static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) -{ - struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0}; - unsigned long retain = get_retain_buffers(c); - unsigned long count; - LIST_HEAD(write_list); - - dm_bufio_lock(c); - - __check_watermark(c, &write_list); - if (unlikely(!list_empty(&write_list))) { - dm_bufio_unlock(c); - __flush_write_list(&write_list); - dm_bufio_lock(c); - } - - count = cache_total(&c->cache); - if (count > retain) - __evict_many(c, ¶ms, LIST_CLEAN, count - retain); - - dm_bufio_unlock(c); -} - -static void cleanup_old_buffers(void) -{ - unsigned long max_age_hz = get_max_age_hz(); - struct dm_bufio_client *c; - - mutex_lock(&dm_bufio_clients_lock); - - __cache_size_refresh(); - - list_for_each_entry(c, &dm_bufio_all_clients, client_list) - evict_old_buffers(c, max_age_hz); - - mutex_unlock(&dm_bufio_clients_lock); -} - -static void work_fn(struct work_struct *w) -{ - cleanup_old_buffers(); - - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); -} - -/*--------------------------------------------------------------*/ - /* * 
Global cleanup tries to evict the oldest buffers from across _all_ * the clients. It does this by repeatedly evicting a few buffers from @@ -2836,27 +2704,55 @@ static void __insert_client(struct dm_bufio_client *new_client) list_add_tail(&new_client->client_list, h); } +static enum evict_result select_for_evict(struct dm_buffer *b, void *context) +{ + /* In no-sleep mode, we cannot wait on IO. */ + if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) { + if (test_bit_acquire(B_READING, &b->state) || + test_bit(B_WRITING, &b->state) || + test_bit(B_DIRTY, &b->state)) + return ER_DONT_EVICT; + } + return ER_EVICT; +} + static unsigned long __evict_a_few(unsigned long nr_buffers) { - unsigned long count; struct dm_bufio_client *c; - struct evict_params params = { - .gfp = GFP_KERNEL, - .age_hz = 0, - /* set to jiffies in case there are no buffers in this client */ - .last_accessed = jiffies - }; + unsigned long oldest_buffer = jiffies; + unsigned long last_accessed; + unsigned long count; + struct dm_buffer *b; c = __pop_client(); if (!c) return 0; dm_bufio_lock(c); - count = __evict_many(c, ¶ms, LIST_CLEAN, nr_buffers); + + for (count = 0; count < nr_buffers; count++) { + b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL); + if (!b) + break; + + last_accessed = READ_ONCE(b->last_accessed); + if (time_after_eq(oldest_buffer, last_accessed)) + oldest_buffer = last_accessed; + + __make_buffer_clean(b); + __free_buffer_wake(b); + + if (need_resched()) { + dm_bufio_unlock(c); + cond_resched(); + dm_bufio_lock(c); + } + } + dm_bufio_unlock(c); if (count) - c->oldest_buffer = params.last_accessed; + c->oldest_buffer = oldest_buffer; __insert_client(c); return count; @@ -2939,10 +2835,7 @@ static int __init dm_bufio_init(void) if (!dm_bufio_wq) return -ENOMEM; - INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); return 0; } @@ -2954,7 +2847,6 @@ static void __exit dm_bufio_exit(void) { int bug = 0; - cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); destroy_workqueue(dm_bufio_wq); if (dm_bufio_client_count) { @@ -2991,7 +2883,7 @@ module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644); MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644); -MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); +MODULE_PARM_DESC(max_age_seconds, "No longer does anything"); module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644); MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 2ed894155cab..7e1e8cc0e33a 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -590,7 +590,7 @@ static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned in nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u)); ht->hash_bits = __ffs(nr_buckets); - ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets))); + ht->buckets = vmalloc_array(nr_buckets, sizeof(*ht->buckets)); if (!ht->buckets) return -ENOMEM; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9cb797a561d6..a10d75a562db 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -406,6 +406,12 @@ struct cache { mempool_t 
migration_pool; struct bio_set bs; + + /* + * Cache_size entries. Set bits indicate blocks mapped beyond the + * target length, which are marked for invalidation. + */ + unsigned long *invalid_bitset; }; struct per_bio_data { @@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache) if (cache->discard_bitset) free_bitset(cache->discard_bitset); + if (cache->invalid_bitset) + free_bitset(cache->invalid_bitset); + if (cache->copier) dm_kcopyd_client_destroy(cache->copier); @@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); + if (!cache->invalid_bitset) { + *error = "could not allocate bitset for invalid blocks"; + goto bad; + } + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(cache->copier)) { *error = "could not create kcopyd client"; @@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); } +static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct cache *cache = context; + + if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { + if (dirty) { + DMERR("%s: unable to shrink origin; cache block %u is dirty", + cache_device_name(cache), from_cblock(cblock)); + return -EFBIG; + } + set_bit(from_cblock(cblock), cache->invalid_bitset); + return 0; + } + + return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); +} + /* * The discard block size in the on disk metadata is not * necessarily the same as we're currently using. So we have to @@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) return to_cblock(size); } +static bool can_resume(struct cache *cache) +{ + /* + * Disallow retrying the resume operation for devices that failed the + * first resume attempt, as the failure leaves the policy object partially + * initialized. Retrying could trigger BUG_ON when loading cache mappings + * into the incomplete policy object. + */ + if (cache->sized && !cache->loaded_mappings) { + if (get_cache_mode(cache) != CM_WRITE) + DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", + cache_device_name(cache)); + else + DMERR("%s: unable to resume cache due to missing proper cache table reload", + cache_device_name(cache)); + return false; + } + + return true; +} + static bool can_resize(struct cache *cache, dm_cblock_t new_size) { if (from_cblock(new_size) > from_cblock(cache->cache_size)) { @@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) return 0; } +static int truncate_oblocks(struct cache *cache) +{ + uint32_t nr_blocks = from_cblock(cache->cache_size); + uint32_t i; + int r; + + for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + return r; + } + } + + return 0; +} + static int cache_preresume(struct dm_target *ti) { int r = 0; struct cache *cache = ti->private; dm_cblock_t csize = get_cache_dev_size(cache); + if (!can_resume(cache)) + return -EINVAL; + /* * Check to see if the cache has resized. 
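The load_filtered_mapping()/truncate_oblocks() pair above lets a cache resume after its origin shrank: clean mappings that now point past the end of the origin are recorded in invalid_bitset while the metadata is loaded and removed afterwards, while a dirty mapping past the end aborts the resume. Below is a compressed userspace sketch of that filtering, with invented block counts and a plain array standing in for the policy and metadata objects.

#include <stdio.h>
#include <stdbool.h>

#define CACHE_BLOCKS    16
#define NO_MAPPING      (-1L)

struct mapping {
        long oblock;    /* origin block backed by this cache block, or NO_MAPPING */
        bool dirty;
};

int main(void)
{
        struct mapping map[CACHE_BLOCKS];
        bool invalid[CACHE_BLOCKS] = { false };
        long origin_blocks = 10;        /* the origin was shrunk to 10 blocks */

        for (int i = 0; i < CACHE_BLOCKS; i++)
                map[i] = (struct mapping){ .oblock = NO_MAPPING };
        map[2] = (struct mapping){ .oblock = 4,  .dirty = false };
        map[5] = (struct mapping){ .oblock = 12, .dirty = false }; /* now past the end */
        map[7] = (struct mapping){ .oblock = 3,  .dirty = true };

        /* pass 1: while "loading" mappings, mark clean ones past the new end;
         * a dirty mapping past the end would abort, as the -EFBIG path does */
        for (int cblock = 0; cblock < CACHE_BLOCKS; cblock++) {
                if (map[cblock].oblock == NO_MAPPING ||
                    map[cblock].oblock < origin_blocks)
                        continue;
                if (map[cblock].dirty) {
                        fprintf(stderr, "cannot shrink origin: cache block %d is dirty\n",
                                cblock);
                        return 1;
                }
                invalid[cblock] = true;
        }

        /* pass 2: drop the marked mappings, as truncate_oblocks() does on disk */
        for (int cblock = 0; cblock < CACHE_BLOCKS; cblock++) {
                if (invalid[cblock]) {
                        map[cblock].oblock = NO_MAPPING;
                        printf("invalidated cache block %d\n", cblock);
                }
        }
        return 0;
}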
*/ @@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { + /* + * The fast device could have been resized since the last + * failed preresume attempt. To be safe we start by a blank + * bitset for cache blocks. + */ + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + r = dm_cache_load_mappings(cache->cmd, cache->policy, - load_mapping, cache); + load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_load_mappings", r); + if (r != -EFBIG) + metadata_operation_failed(cache, "dm_cache_load_mappings", r); + return r; + } + + r = truncate_oblocks(cache); + if (r) { + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); return r; } @@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 2, 0}, + .version = {2, 3, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 3637761f3585..a3c9f74fe2dc 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -141,6 +141,7 @@ struct mapped_device { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_zones; void *zone_revalidate_map; + struct task_struct *revalidate_map_task; #endif #ifdef CONFIG_IMA @@ -161,9 +162,7 @@ struct mapped_device { #define DMF_SUSPENDED_INTERNALLY 7 #define DMF_POST_SUSPENDING 8 #define DMF_EMULATE_ZONE_APPEND 9 - -void disable_discard(struct mapped_device *md); -void disable_write_zeroes(struct mapped_device *md); +#define DMF_QUEUE_STOPPED 10 static inline sector_t dm_get_size(struct mapped_device *md) { @@ -293,6 +292,7 @@ struct dm_io { struct dm_io *next; struct dm_stats_aux stats_aux; blk_status_t status; + bool requeue_flush_with_data; atomic_t io_count; struct mapped_device *md; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1ae2c71bb383..5ef43231fe77 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -17,6 +17,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> +#include <linux/crc32.h> #include <linux/mempool.h> #include <linux/slab.h> #include <linux/crypto.h> @@ -59,6 +60,7 @@ struct convert_context { struct bio *bio_out; struct bvec_iter iter_out; atomic_t cc_pending; + unsigned int tag_offset; u64 cc_sector; union { struct skcipher_request *req; @@ -124,7 +126,6 @@ struct iv_lmk_private { #define TCW_WHITENING_SIZE 16 struct iv_tcw_private { - struct crypto_shash *crc32_tfm; u8 *iv_seed; u8 *whitening; }; @@ -252,17 +253,35 @@ MODULE_PARM_DESC(max_read_size, "Maximum size of a read request"); static unsigned int max_write_size = 0; module_param(max_write_size, uint, 0644); MODULE_PARM_DESC(max_write_size, "Maximum size of a write request"); -static unsigned get_max_request_size(struct crypt_config *cc, bool wrt) + +static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio) { + struct crypt_config *cc = ti->private; unsigned val, sector_align; - val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size); - if (likely(!val)) - val = !wrt ? 
DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE; - if (wrt || cc->used_tag_size) { - if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT)) - val = BIO_MAX_VECS << PAGE_SHIFT; - } - sector_align = max(bdev_logical_block_size(cc->dev->bdev), (unsigned)cc->sector_size); + bool wrt = op_is_write(bio_op(bio)); + + if (wrt) { + /* + * For zoned devices, splitting write operations creates the + * risk of deadlocking queue freeze operations with zone write + * plugging BIO work when the reminder of a split BIO is + * issued. So always allow the entire BIO to proceed. + */ + if (ti->emulate_zone_append) + return bio_sectors(bio); + + val = min_not_zero(READ_ONCE(max_write_size), + DM_CRYPT_DEFAULT_MAX_WRITE_SIZE); + } else { + val = min_not_zero(READ_ONCE(max_read_size), + DM_CRYPT_DEFAULT_MAX_READ_SIZE); + } + + if (wrt || cc->used_tag_size) + val = min(val, BIO_MAX_VECS << PAGE_SHIFT); + + sector_align = max(bdev_logical_block_size(cc->dev->bdev), + (unsigned)cc->sector_size); val = round_down(val, sector_align); if (unlikely(!val)) val = sector_align; @@ -516,7 +535,10 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); - struct md5_state md5state; + union { + struct md5_state md5state; + u8 state[CRYPTO_MD5_STATESIZE]; + } u; __le32 buf[4]; int i, r; @@ -547,13 +569,13 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, return r; /* No MD5 padding here */ - r = crypto_shash_export(desc, &md5state); + r = crypto_shash_export(desc, &u.md5state); if (r) return r; for (i = 0; i < MD5_HASH_WORDS; i++) - __cpu_to_le32s(&md5state.hash[i]); - memcpy(iv, &md5state.hash, cc->iv_size); + __cpu_to_le32s(&u.md5state.hash[i]); + memcpy(iv, &u.md5state.hash, cc->iv_size); return 0; } @@ -606,10 +628,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc) tcw->iv_seed = NULL; kfree_sensitive(tcw->whitening); tcw->whitening = NULL; - - if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) - crypto_free_shash(tcw->crc32_tfm); - tcw->crc32_tfm = NULL; } static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -627,13 +645,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(tcw->crc32_tfm)) { - ti->error = "Error initializing CRC32 in TCW"; - return PTR_ERR(tcw->crc32_tfm); - } - tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); if (!tcw->iv_seed || !tcw->whitening) { @@ -667,36 +678,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_whitening(struct crypt_config *cc, - struct dm_crypt_request *dmreq, - u8 *data) +static void crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, u8 *data) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 buf[TCW_WHITENING_SIZE]; - SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); - int i, r; + int i; /* xor whitening with sector number */ crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ - desc->tfm = tcw->crc32_tfm; - for (i = 0; i < 4; i++) { - r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]); - if (r) - goto out; - } + for (i = 0; i < 4; i++) + put_unaligned_le32(crc32(0, &buf[i * 4], 4), 
&buf[i * 4]); crypto_xor(&buf[0], &buf[12], 4); crypto_xor(&buf[4], &buf[8], 4); /* apply whitening (8 bytes) to whole sector */ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) crypto_xor(data + i * 8, buf, 8); -out: memzero_explicit(buf, sizeof(buf)); - return r; } static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, @@ -706,13 +709,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; - int r = 0; /* Remove whitening from ciphertext */ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { sg = crypt_get_sg_data(cc, dmreq->sg_in); src = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); kunmap_local(src); } @@ -722,7 +724,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, cc->iv_size - 8); - return r; + return 0; } static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, @@ -730,7 +732,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *dst; - int r; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) return 0; @@ -738,10 +739,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - return r; + return 0; } static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, @@ -1187,7 +1188,7 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) tag_len = io->cc->tuple_size * (bio_sectors(bio) >> io->cc->sector_shift); - bip->bip_iter.bi_sector = io->cc->start + io->sector; + bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), tag_len, offset_in_page(io->integrity_metadata)); @@ -1209,11 +1210,11 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } - if (bi->tuple_size < cc->used_tag_size) { + if (bi->metadata_size < cc->used_tag_size) { ti->error = "Integrity profile tag size mismatch."; return -EINVAL; } - cc->tuple_size = bi->tuple_size; + cc->tuple_size = bi->metadata_size; if (1 << bi->interval_exp != cc->sector_size) { ti->error = "Integrity profile sector size mismatch."; return -EINVAL; @@ -1256,6 +1257,7 @@ static void crypt_convert_init(struct crypt_config *cc, if (bio_out) ctx->iter_out = bio_out->bi_iter; ctx->cc_sector = sector + cc->iv_offset; + ctx->tag_offset = 0; init_completion(&ctx->restart); } @@ -1588,7 +1590,6 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ static blk_status_t crypt_convert(struct crypt_config *cc, struct convert_context *ctx, bool atomic, bool reset_pending) { - unsigned int tag_offset = 0; unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; int r; @@ -1611,9 +1612,9 @@ static blk_status_t crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->cc_pending); if (crypt_integrity_aead(cc)) - r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset); + r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, ctx->tag_offset); else - r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset); + r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, ctx->tag_offset); switch (r) { 
/* @@ -1633,8 +1634,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc, * exit and continue processing in a workqueue */ ctx->r.req = NULL; + ctx->tag_offset++; ctx->cc_sector += sector_step; - tag_offset++; return BLK_STS_DEV_RESOURCE; } } else { @@ -1648,8 +1649,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc, */ case -EINPROGRESS: ctx->r.req = NULL; + ctx->tag_offset++; ctx->cc_sector += sector_step; - tag_offset++; continue; /* * The request was already processed (synchronously). @@ -1657,7 +1658,7 @@ static blk_status_t crypt_convert(struct crypt_config *cc, case 0: atomic_dec(&ctx->cc_pending); ctx->cc_sector += sector_step; - tag_offset++; + ctx->tag_offset++; if (!atomic) cond_resched(); continue; @@ -1719,6 +1720,7 @@ retry: clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_ioprio = io->base_bio->bi_ioprio; + clone->bi_iter.bi_sector = cc->start + io->sector; remaining_size = size; @@ -1909,7 +1911,6 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) crypt_dec_pending(io); return 1; } - clone->bi_iter.bi_sector = cc->start + io->sector; crypt_convert_init(cc, &io->ctx, clone, clone, io->sector); io->saved_bi_iter = clone->bi_iter; dm_submit_bio_remap(io->base_bio, clone); @@ -1925,13 +1926,13 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) clone = bio_alloc_clone(cc->dev->bdev, io->base_bio, gfp, &cc->bs); if (!clone) return 1; + + clone->bi_iter.bi_sector = cc->start + io->sector; clone->bi_private = io; clone->bi_end_io = crypt_endio; crypt_inc_pending(io); - clone->bi_iter.bi_sector = cc->start + io->sector; - if (dm_crypt_integrity_io_alloc(io, clone)) { crypt_dec_pending(io); bio_put(clone); @@ -2039,8 +2040,6 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) /* crypt_convert should have filled the clone bio */ BUG_ON(io->ctx.iter_out.bi_size); - clone->bi_iter.bi_sector = cc->start + io->sector; - if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { dm_submit_bio_remap(io->base_bio, clone); @@ -2092,13 +2091,12 @@ static void kcryptd_crypt_write_continue(struct work_struct *work) struct crypt_config *cc = io->cc; struct convert_context *ctx = &io->ctx; int crypt_finished; - sector_t sector = io->sector; blk_status_t r; wait_for_completion(&ctx->restart); reinit_completion(&ctx->restart); - r = crypt_convert(cc, &io->ctx, true, false); + r = crypt_convert(cc, &io->ctx, false, false); if (r) io->error = r; crypt_finished = atomic_dec_and_test(&ctx->cc_pending); @@ -2109,10 +2107,8 @@ static void kcryptd_crypt_write_continue(struct work_struct *work) } /* Encryption was already finished, submit io now */ - if (crypt_finished) { + if (crypt_finished) kcryptd_crypt_write_io_submit(io, 0); - io->sector = sector; - } crypt_dec_pending(io); } @@ -2123,14 +2119,13 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) struct convert_context *ctx = &io->ctx; struct bio *clone; int crypt_finished; - sector_t sector = io->sector; blk_status_t r; /* * Prevent io from disappearing until this function completes. 
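The tag_offset move above, from a local variable in crypt_convert() into struct convert_context, matters because conversion can be suspended on -EBUSY/-EINPROGRESS and resumed later from a workqueue; a local counter would restart at zero on re-entry and index the wrong integrity tags. Below is an abstracted sketch of the idea with invented names and no real crypto, not the dm-crypt code itself.

#include <stdio.h>

/* abstracted stand-in for convert_context: the cursor lives in the context,
 * not in a local variable of the conversion routine */
struct convert_ctx {
        unsigned int tag_offset;        /* index into the integrity tag area */
        unsigned long long sector;
        unsigned int remaining;         /* blocks still to process */
};

/* process at most `budget` blocks, then return as if the crypto driver had
 * answered -EBUSY and the remainder were deferred to a workqueue */
static int convert(struct convert_ctx *ctx, unsigned int budget)
{
        while (ctx->remaining && budget--) {
                /* ... encrypt one block, consuming one tag slot ... */
                ctx->tag_offset++;
                ctx->sector++;
                ctx->remaining--;
        }
        return ctx->remaining ? -1 : 0; /* -1: call again later */
}

int main(void)
{
        struct convert_ctx ctx = { .tag_offset = 0, .sector = 1024, .remaining = 8 };

        /* the first passes run out of budget; each resume continues where the
         * previous call stopped, whereas a local tag_offset would restart at 0 */
        while (convert(&ctx, 3))
                printf("suspended at tag_offset=%u sector=%llu\n",
                       ctx.tag_offset, ctx.sector);
        printf("finished  at tag_offset=%u sector=%llu\n",
               ctx.tag_offset, ctx.sector);
        return 0;
}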
*/ crypt_inc_pending(io); - crypt_convert_init(cc, ctx, NULL, io->base_bio, sector); + crypt_convert_init(cc, ctx, NULL, io->base_bio, io->sector); clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); if (unlikely(!clone)) { @@ -2147,8 +2142,6 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) io->ctx.iter_in = clone->bi_iter; } - sector += bio_sectors(clone); - crypt_inc_pending(io); r = crypt_convert(cc, ctx, test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags), true); @@ -2172,10 +2165,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) } /* Encryption was already finished, submit io now */ - if (crypt_finished) { + if (crypt_finished) kcryptd_crypt_write_io_submit(io, 0); - io->sector = sector; - } dec: crypt_dec_pending(io); @@ -2203,7 +2194,7 @@ static void kcryptd_crypt_read_continue(struct work_struct *work) wait_for_completion(&io->ctx.restart); reinit_completion(&io->ctx.restart); - r = crypt_convert(cc, &io->ctx, true, false); + r = crypt_convert(cc, &io->ctx, false, false); if (r) io->error = r; @@ -2221,7 +2212,6 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_inc_pending(io); if (io->ctx.aead_recheck) { - io->ctx.cc_sector = io->sector + cc->iv_offset; r = crypt_convert(cc, &io->ctx, test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); } else { @@ -3524,7 +3514,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. */ - max_sectors = get_max_request_size(cc, bio_data_dir(bio) == WRITE); + max_sectors = get_max_request_sectors(ti, bio); if (unlikely(bio_sectors(bio) > max_sectors)) dm_accept_partial_bio(bio, max_sectors); @@ -3761,6 +3751,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) max_t(unsigned int, limits->physical_block_size, cc->sector_size); limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size); limits->dma_alignment = limits->logical_block_size - 1; + + /* + * For zoned dm-crypt targets, there will be no internal splitting of + * write BIOs to avoid exceeding BIO_MAX_VECS vectors per BIO. But + * without respecting this limit, crypt_alloc_buffer() will trigger a + * BUG(). Avoid this by forcing DM core to split write BIOs to this + * limit. 
+ */ + if (ti->emulate_zone_append) + limits->max_hw_sectors = min(limits->max_hw_sectors, + BIO_MAX_VECS << PAGE_SECTORS_SHIFT); } static struct target_type crypt_target = { diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 08f6387620c1..4bb6553278c7 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -14,11 +14,14 @@ #include <linux/bio.h> #include <linux/slab.h> #include <linux/kthread.h> +#include <linux/delay.h> #include <linux/device-mapper.h> #define DM_MSG_PREFIX "delay" +#define SLEEP_SHIFT 3 + struct delay_class { struct dm_dev *dev; sector_t start; @@ -34,6 +37,7 @@ struct delay_c { struct work_struct flush_expired_bios; struct list_head delayed_bios; struct task_struct *worker; + unsigned int worker_sleep_us; bool may_delay; struct delay_class read; @@ -52,7 +56,7 @@ struct dm_delay_info { static void handle_delayed_timer(struct timer_list *t) { - struct delay_c *dc = from_timer(dc, t, delay_timer); + struct delay_c *dc = timer_container_of(dc, t, delay_timer); queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } @@ -136,6 +140,7 @@ static int flush_worker_fn(void *data) schedule(); } else { spin_unlock(&dc->delayed_bios_lock); + fsleep(dc->worker_sleep_us); cond_resched(); } } @@ -212,7 +217,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; int ret; - unsigned int max_delay; + unsigned int max_delay, min_delay; if (argc != 3 && argc != 6 && argc != 9) { ti->error = "Requires exactly 3, 6 or 9 arguments"; @@ -235,7 +240,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->read, argv); if (ret) goto bad; - max_delay = dc->read.delay; + min_delay = max_delay = dc->read.delay; if (argc == 3) { ret = delay_class_ctr(ti, &dc->write, argv); @@ -251,6 +256,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; max_delay = max(max_delay, dc->write.delay); + min_delay = min_not_zero(min_delay, dc->write.delay); if (argc == 6) { ret = delay_class_ctr(ti, &dc->flush, argv + 3); @@ -263,9 +269,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; max_delay = max(max_delay, dc->flush.delay); + min_delay = min_not_zero(min_delay, dc->flush.delay); out: if (max_delay < 50) { + if (min_delay >> SLEEP_SHIFT) + dc->worker_sleep_us = 1000; + else + dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT; /* * In case of small requested delays, use kthread instead of * timers and workqueue to achieve better latency. 
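The dm-delay kthread path above now sleeps between scans of the delayed-bio list instead of relying on cond_resched() alone; worker_sleep_us works out to roughly one eighth (SLEEP_SHIFT = 3) of the smallest configured delay, capped at 1 ms. A small sketch of that arithmetic for a few hypothetical delays; as in the constructor, this only applies when the largest delay is under 50 ms, since larger delays keep using timers and the workqueue.

#include <stdio.h>

#define SLEEP_SHIFT 3

int main(void)
{
        /* hypothetical smallest configured delays, in milliseconds */
        unsigned int delays_ms[] = { 0, 1, 3, 7, 8, 20 };

        for (unsigned int i = 0; i < sizeof(delays_ms) / sizeof(delays_ms[0]); i++) {
                unsigned int min_delay = delays_ms[i], sleep_us;

                /* mirrors delay_ctr(): sleep about min_delay/8 per loop
                 * iteration, but never more than 1 ms */
                if (min_delay >> SLEEP_SHIFT)
                        sleep_us = 1000;
                else
                        sleep_us = (min_delay * 1000) >> SLEEP_SHIFT;

                printf("min_delay=%ums -> worker sleeps %uus per iteration\n",
                       min_delay, sleep_us);
        }
        return 0;
}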
@@ -369,6 +380,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, c, bio); } +#ifdef CONFIG_BLK_DEV_ZONED +static int delay_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct delay_c *dc = ti->private; + struct delay_class *c = &dc->read; + + return dm_report_zones(c->dev->bdev, c->start, + c->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); +} +#else +#define delay_report_zones NULL +#endif + #define DMEMIT_DELAY_CLASS(c) \ DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) @@ -423,12 +449,13 @@ out: static struct target_type delay_target = { .name = "delay", - .version = {1, 4, 0}, - .features = DM_TARGET_PASSES_INTEGRITY, + .version = {1, 5, 0}, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, .map = delay_map, + .report_zones = delay_report_zones, .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 1a33820c9f46..e75310232bbf 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -534,7 +534,9 @@ static void dust_status(struct dm_target *ti, status_type_t type, } } -static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dust_device *dd = ti->private; struct dm_dev *dev = dd->dev; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index ec5db1478b2f..6abb31ca9662 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +static void ebs_postsuspend(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + dm_bufio_client_reset(ec->bufio); +} + static void ebs_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { @@ -409,7 +415,8 @@ static void ebs_status(struct dm_target *ti, status_type_t type, } } -static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct ebs_c *ec = ti->private; struct dm_dev *dev = ec->dev; @@ -442,11 +449,12 @@ static int ebs_iterate_devices(struct dm_target *ti, static struct target_type ebs_target = { .name = "ebs", .version = {1, 0, 1}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = 0, .module = THIS_MODULE, .ctr = ebs_ctr, .dtr = ebs_dtr, .map = ebs_map, + .postsuspend = ebs_postsuspend, .status = ebs_status, .io_hints = ebs_io_hints, .prepare_ioctl = ebs_prepare_ioctl, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 731467d4ed10..08925aca838c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -47,14 +47,15 @@ enum feature_flag_bits { }; struct per_bio_data { - bool bio_submitted; + bool bio_can_corrupt; + struct bvec_iter saved_iter; }; static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, struct dm_target *ti) { - int r; - unsigned int argc; + int r = 0; + unsigned int argc = 0; const char *arg_name; static const struct dm_arg _args[] = { @@ -65,14 +66,13 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, {0, PROBABILITY_BASE, "Invalid random 
corrupt argument"}, }; - /* No feature arguments supplied. */ - if (!as->argc) - return 0; - - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) + if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error))) return r; + /* No feature arguments supplied. */ + if (!argc) + goto error_all_io; + while (argc) { arg_name = dm_shift_arg(as); argc--; @@ -128,8 +128,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> */ if (!strcasecmp(arg_name, "corrupt_bio_byte")) { - if (!argc) { - ti->error = "Feature corrupt_bio_byte requires parameters"; + if (fc->corrupt_bio_byte) { + ti->error = "Feature corrupt_bio_byte duplicated"; + return -EINVAL; + } else if (argc < 4) { + ti->error = "Feature corrupt_bio_byte requires 4 parameters"; return -EINVAL; } @@ -176,7 +179,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (!strcasecmp(arg_name, "random_read_corrupt")) { - if (!argc) { + if (fc->random_read_corrupt) { + ti->error = "Feature random_read_corrupt duplicated"; + return -EINVAL; + } else if (!argc) { ti->error = "Feature random_read_corrupt requires a parameter"; return -EINVAL; } @@ -189,7 +195,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (!strcasecmp(arg_name, "random_write_corrupt")) { - if (!argc) { + if (fc->random_write_corrupt) { + ti->error = "Feature random_write_corrupt duplicated"; + return -EINVAL; + } else if (!argc) { ti->error = "Feature random_write_corrupt requires a parameter"; return -EINVAL; } @@ -205,18 +214,28 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, return -EINVAL; } - if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { - ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + if (test_bit(DROP_WRITES, &fc->flags) && + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) || + fc->random_write_corrupt)) { + ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; return -EINVAL; - } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { - ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + } else if (test_bit(ERROR_WRITES, &fc->flags) && + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) || + fc->random_write_corrupt)) { + ti->error = "error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } else if (test_bit(ERROR_READS, &fc->flags) && + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == READ) || + fc->random_read_corrupt)) { + ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag set"; return -EINVAL; } if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) && !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) && !fc->random_read_corrupt && !fc->random_write_corrupt) { +error_all_io: set_bit(ERROR_WRITES, &fc->flags); set_bit(ERROR_READS, &fc->flags); } @@ -278,7 +297,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + r = dm_read_arg(_args + 1, &as, &fc->down_interval, &ti->error); if (r) goto bad; @@ -339,7 +358,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) } static void corrupt_bio_common(struct bio *bio, 
unsigned int corrupt_bio_byte, - unsigned char corrupt_bio_value) + unsigned char corrupt_bio_value, + struct bvec_iter start) { struct bvec_iter iter; struct bio_vec bvec; @@ -348,7 +368,7 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, * Overwrite the Nth byte of the bio's data, on whichever page * it falls. */ - bio_for_each_segment(bvec, bio, iter) { + __bio_for_each_segment(bvec, bio, iter, start) { if (bio_iter_len(bio, iter) > corrupt_bio_byte) { unsigned char *segment = bvec_kmap_local(&bvec); segment[corrupt_bio_byte] = corrupt_bio_value; @@ -357,36 +377,31 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", bio, corrupt_bio_value, corrupt_bio_byte, (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size); + (unsigned long long)start.bi_sector, + start.bi_size); break; } corrupt_bio_byte -= bio_iter_len(bio, iter); } } -static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc, + struct bvec_iter start) { unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; - if (!bio_has_data(bio)) - return; - - corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value); + corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start); } -static void corrupt_bio_random(struct bio *bio) +static void corrupt_bio_random(struct bio *bio, struct bvec_iter start) { unsigned int corrupt_byte; unsigned char corrupt_value; - if (!bio_has_data(bio)) - return; - - corrupt_byte = get_random_u32() % bio->bi_iter.bi_size; + corrupt_byte = get_random_u32() % start.bi_size; corrupt_value = get_random_u8(); - corrupt_bio_common(bio, corrupt_byte, corrupt_value); + corrupt_bio_common(bio, corrupt_byte, corrupt_value, start); } static void clone_free(struct bio *clone) @@ -426,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, bio->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; @@ -481,7 +496,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) unsigned int elapsed; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - pb->bio_submitted = false; + pb->bio_can_corrupt = false; if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; @@ -490,14 +505,15 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) elapsed = (jiffies - fc->start_time) / HZ; if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { bool corrupt_fixed, corrupt_random; - /* - * Flag this bio as submitted while down. - */ - pb->bio_submitted = true; + + if (bio_has_data(bio)) { + pb->bio_can_corrupt = true; + pb->saved_iter = bio->bi_iter; + } /* - * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set. - * Otherwise, flakey_end_io() will decide if the reads should be modified. + * If ERROR_READS isn't set flakey_end_io() will decide if the + * reads should be modified. */ if (bio_data_dir(bio) == READ) { if (test_bit(ERROR_READS, &fc->flags)) @@ -516,6 +532,8 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + if (!pb->bio_can_corrupt) + goto map_bio; /* * Corrupt matching writes. 
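flakey_map() above snapshots bio->bi_iter into the per-bio data before the bio goes down the stack, because by the time flakey_end_io() runs the iterator has been consumed; corruption therefore has to walk the data from the saved position rather than from the completed bio's iterator. Here is a userspace sketch of walking segments from a saved cursor to hit the Nth byte; seg and iter are simplified placeholders, not the block-layer types.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* simplified stand-ins for bio segments and a bvec_iter-style cursor */
struct seg {
        uint8_t *data;
        size_t len;
};

struct iter {
        size_t seg_idx;         /* which segment */
        size_t offset;          /* byte offset within that segment */
};

/* corrupt the Nth byte counted from a *saved* starting position, the way
 * corrupt_bio_common() now walks from pb->saved_iter instead of bio->bi_iter */
static void corrupt_nth(struct seg *segs, size_t nsegs, struct iter start,
                        size_t nth, uint8_t value)
{
        for (size_t i = start.seg_idx; i < nsegs; i++) {
                size_t off = (i == start.seg_idx) ? start.offset : 0;
                size_t avail = segs[i].len - off;

                if (nth < avail) {
                        segs[i].data[off + nth] = value;
                        return;
                }
                nth -= avail;
        }
}

int main(void)
{
        uint8_t a[4] = { 0 }, b[4] = { 0 };
        struct seg segs[] = { { a, sizeof(a) }, { b, sizeof(b) } };
        struct iter saved = { 0, 0 };   /* captured before the buffer was consumed */

        corrupt_nth(segs, 2, saved, 5, 0xff);   /* lands in the second segment */
        printf("b[1] = 0x%02x\n", b[1]);
        return 0;
}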
*/ @@ -535,9 +553,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct bio *clone = clone_bio(ti, fc, bio); if (clone) { if (corrupt_fixed) - corrupt_bio_data(clone, fc); + corrupt_bio_data(clone, fc, + clone->bi_iter); if (corrupt_random) - corrupt_bio_random(clone); + corrupt_bio_random(clone, + clone->bi_iter); submit_bio(clone); return DM_MAPIO_SUBMITTED; } @@ -559,28 +579,21 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; - if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) { if (fc->corrupt_bio_byte) { if ((fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) { /* * Corrupt successful matching READs while in down state. */ - corrupt_bio_data(bio, fc); + corrupt_bio_data(bio, fc, pb->saved_iter); } } if (fc->random_read_corrupt) { u64 rnd = get_random_u64(); u32 rem = do_div(rnd, PROBABILITY_BASE); if (rem < fc->random_read_corrupt) - corrupt_bio_random(bio); - } - if (test_bit(ERROR_READS, &fc->flags)) { - /* - * Error read during the down_interval if drop_writes - * and error_writes were not configured. - */ - *error = BLK_STS_IOERR; + corrupt_bio_random(bio, pb->saved_iter); } } @@ -638,7 +651,9 @@ static void flakey_status(struct dm_target *ti, status_type_t type, } } -static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct flakey_c *fc = ti->private; diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index b90f34259fbb..efb3cd4f9cd4 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -45,7 +45,7 @@ static void fix_separator_chars(char **buf) /* * Internal function to allocate memory for IMA measurements. 
*/ -static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) +static void *dm_ima_alloc(size_t len, bool noio) { unsigned int noio_flag; void *ptr; @@ -53,7 +53,7 @@ static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) if (noio) noio_flag = memalloc_noio_save(); - ptr = kzalloc(len, flags); + ptr = kzalloc(len, GFP_KERNEL); if (noio) memalloc_noio_restore(noio_flag); @@ -68,13 +68,13 @@ static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_ char **dev_uuid, bool noio) { int r; - *dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio); + *dev_name = dm_ima_alloc(DM_NAME_LEN*2, noio); if (!(*dev_name)) { r = -ENOMEM; goto error; } - *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio); + *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, noio); if (!(*dev_uuid)) { r = -ENOMEM; goto error; @@ -109,7 +109,7 @@ static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **de if (r) return r; - *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!(*device_data)) { r = -ENOMEM; goto error; @@ -153,14 +153,12 @@ static int dm_ima_alloc_and_copy_capacity_str(struct mapped_device *md, char **c capacity = get_capacity(md->disk); - *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, GFP_KERNEL, noio); + *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, noio); if (!(*capacity_str)) return -ENOMEM; - scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", - capacity); - - return 0; + return scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", + capacity); } /* @@ -195,15 +193,15 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; char table_load_event_name[] = "dm_table_load"; - ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio); + ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio); if (!ima_buf) return; - target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio); + target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, noio); if (!target_metadata_buf) goto error; - target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio); + target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, noio); if (!target_data_buf) goto error; @@ -218,7 +216,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl shash->tfm = tfm; digest_size = crypto_shash_digestsize(tfm); - digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio); + digest = dm_ima_alloc(digest_size, noio); if (!digest) goto error; @@ -241,10 +239,11 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl /* * First retrieve the target metadata. */ - scnprintf(target_metadata_buf, DM_IMA_TARGET_METADATA_BUF_LEN, - "target_index=%d,target_begin=%llu,target_len=%llu,", - i, ti->begin, ti->len); - target_metadata_buf_len = strlen(target_metadata_buf); + target_metadata_buf_len = + scnprintf(target_metadata_buf, + DM_IMA_TARGET_METADATA_BUF_LEN, + "target_index=%d,target_begin=%llu,target_len=%llu,", + i, ti->begin, ti->len); /* * Then retrieve the actual target data. 
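Several of the dm-ima hunks above replace a scnprintf()-then-strlen() pair with direct use of the scnprintf() return value, and dm_ima_alloc_and_copy_capacity_str() now hands that length back (negative on failure) so callers can keep a running offset. A userspace sketch of the pattern follows; since scnprintf() is kernel-only, a vsnprintf()-based stand-in with the same "bytes actually stored" return convention is used here.

#include <stdio.h>
#include <stdarg.h>
#include <string.h>

/* userspace stand-in for the kernel's scnprintf(): returns the number of
 * characters actually stored, never more than size - 1 */
static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int n;

        va_start(ap, fmt);
        n = vsnprintf(buf, size, fmt, ap);
        va_end(ap);
        if (n < 0)
                return 0;
        if ((size_t)n >= size)
                n = size ? (int)(size - 1) : 0;
        return n;
}

int main(void)
{
        char buf[128];
        unsigned long long capacity = 41943040;         /* hypothetical capacity */
        int l;

        /* take the length from the return value instead of a later strlen() */
        l = my_scnprintf(buf, sizeof(buf), "name=%s,uuid=%s;", "dm-0", "abcd-1234");
        l += my_scnprintf(buf + l, sizeof(buf) - l,
                          "current_device_capacity=%llu;", capacity);

        printf("%d bytes: %s\n", l, buf);
        return 0;
}

Capturing the return value avoids a second pass over the buffer and makes the subsequent memcpy()-style appends in the measurement code use a length that is guaranteed to match what was actually written.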
@@ -326,7 +325,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (r < 0) goto error; - digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, GFP_KERNEL, noio); + digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio); if (!digest_buf) goto error; @@ -370,18 +369,18 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) { char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char active[] = "active_table_hash="; - unsigned int active_len = strlen(active), capacity_len = 0; + unsigned int active_len = strlen(active); unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -444,18 +443,14 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) } if (nodata) { - r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio); - if (r) + if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;device_resume=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); - + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_resume=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -484,18 +479,17 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) unsigned int device_active_len = strlen(device_active_str); unsigned int device_inactive_len = strlen(device_inactive_str); unsigned int remove_all_len = strlen(remove_all_str); - unsigned int capacity_len = 0; unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, noio); if (!device_table_data) goto exit; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) { + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) { kfree(device_table_data); goto exit; } @@ -561,10 +555,9 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;device_remove=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_remove=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } memcpy(device_table_data + l, remove_all_str, remove_all_len); @@ -572,7 +565,6 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) memcpy(device_table_data + l, remove_all ? 
"y;" : "n;", 2); l += 2; - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -604,20 +596,20 @@ exit: */ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) { - unsigned int l = 0, capacity_len = 0; + unsigned int l = 0; char *device_table_data = NULL, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char inactive_str[] = "inactive_table_hash="; unsigned int inactive_len = strlen(inactive_str); bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error1; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -647,13 +639,11 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error2; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;table_clear=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;table_clear=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -706,7 +696,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL; char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL; bool noio = true; - int r; + int len; if (dm_ima_alloc_and_copy_device_data(md, &new_device_data, md->ima.active_table.num_targets, noio)) @@ -715,12 +705,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) if (dm_ima_alloc_and_copy_name_uuid(md, &new_dev_name, &new_dev_uuid, noio)) goto error; - combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, GFP_KERNEL, noio); + combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, noio); if (!combined_device_data) goto error; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + if (dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio) < 0) goto error; old_device_data = md->ima.active_table.device_metadata; @@ -728,12 +717,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) md->ima.active_table.device_metadata = new_device_data; md->ima.active_table.device_metadata_len = strlen(new_device_data); - scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2, - "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data, - new_dev_name, new_dev_uuid, capacity_str); + len = scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2, + "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data, + new_dev_name, new_dev_uuid, capacity_str); - dm_ima_measure_data("dm_device_rename", combined_device_data, strlen(combined_device_data), - noio); + dm_ima_measure_data("dm_device_rename", combined_device_data, len, noio); goto exit; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee9f7cecd78e..170bf67a2edd 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -21,6 +21,7 @@ #include 
<linux/reboot.h> #include <crypto/hash.h> #include <crypto/skcipher.h> +#include <crypto/utils.h> #include <linux/async_tx.h> #include <linux/dm-bufio.h> @@ -132,7 +133,7 @@ struct journal_sector { commit_id_t commit_id; }; -#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) +#define MAX_TAG_SIZE 255 #define METADATA_PADDING_SECTORS 8 @@ -218,10 +219,13 @@ struct dm_integrity_c { __u8 log2_blocks_per_bitmap_bit; unsigned char mode; + bool internal_hash; int failed; - struct crypto_shash *internal_hash; + struct crypto_shash *internal_shash; + struct crypto_ahash *internal_ahash; + unsigned int internal_hash_digestsize; struct dm_target *ti; @@ -276,6 +280,9 @@ struct dm_integrity_c { bool fix_hmac; bool legacy_recalculate; + mempool_t ahash_req_pool; + struct ahash_request *journal_ahash_req; + struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; @@ -325,6 +332,8 @@ struct dm_integrity_io { unsigned payload_len; bool integrity_payload_from_mempool; bool integrity_range_locked; + + struct ahash_request *ahash_req; }; struct journal_completion { @@ -351,6 +360,7 @@ struct bitmap_block_status { static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 +#define AHASH_MEMPOOL 32 #ifdef DEBUG_PRINT #define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) @@ -516,7 +526,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) dm_integrity_io_error(ic, "crypto_shash_digest", r); return r; } - if (memcmp(mac, actual_mac, mac_size)) { + if (crypto_memneq(mac, actual_mac, mac_size)) { dm_integrity_io_error(ic, "superblock mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); return -EILSEQ; @@ -859,7 +869,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool if (likely(wr)) memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); else { - if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { + if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { dm_integrity_io_error(ic, "journal mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); } @@ -1401,10 +1411,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, unsigned int *metadata_offset, unsigned int total_size, int op) { -#define MAY_BE_FILLER 1 -#define MAY_BE_HASH 2 unsigned int hash_offset = 0; - unsigned int may_be = MAY_BE_HASH | (ic->discard ? 
MAY_BE_FILLER : 0); + unsigned char mismatch_hash = 0; + unsigned char mismatch_filler = !ic->discard; do { unsigned char *data, *dp; @@ -1425,7 +1434,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se if (op == TAG_READ) { memcpy(tag, dp, to_copy); } else if (op == TAG_WRITE) { - if (memcmp(dp, tag, to_copy)) { + if (crypto_memneq(dp, tag, to_copy)) { memcpy(dp, tag, to_copy); dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); } @@ -1433,29 +1442,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se /* e.g.: op == TAG_CMP */ if (likely(is_power_of_2(ic->tag_size))) { - if (unlikely(memcmp(dp, tag, to_copy))) - if (unlikely(!ic->discard) || - unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) { - goto thorough_test; - } + if (unlikely(crypto_memneq(dp, tag, to_copy))) + goto thorough_test; } else { unsigned int i, ts; thorough_test: ts = total_size; for (i = 0; i < to_copy; i++, ts--) { - if (unlikely(dp[i] != tag[i])) - may_be &= ~MAY_BE_HASH; - if (likely(dp[i] != DISCARD_FILLER)) - may_be &= ~MAY_BE_FILLER; + /* + * Warning: the control flow must not be + * dependent on match/mismatch of + * individual bytes. + */ + mismatch_hash |= dp[i] ^ tag[i]; + mismatch_filler |= dp[i] ^ DISCARD_FILLER; hash_offset++; if (unlikely(hash_offset == ic->tag_size)) { - if (unlikely(!may_be)) { + if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) { dm_bufio_release(b); return ts; } hash_offset = 0; - may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + mismatch_hash = 0; + mismatch_filler = !ic->discard; } } } @@ -1476,8 +1486,6 @@ thorough_test: } while (unlikely(total_size)); return 0; -#undef MAY_BE_FILLER -#undef MAY_BE_HASH } struct flush_request { @@ -1541,7 +1549,8 @@ static void sleep_on_endio_wait(struct dm_integrity_c *ic) static void autocommit_fn(struct timer_list *t) { - struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer); + struct dm_integrity_c *ic = timer_container_of(ic, t, + autocommit_timer); if (likely(!dm_integrity_failed(ic))) queue_work(ic->commit_wq, &ic->commit_work); @@ -1634,15 +1643,15 @@ static void integrity_end_io(struct bio *bio) dec_in_flight(dio); } -static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, - const char *data, char *result) +static void integrity_sector_checksum_shash(struct dm_integrity_c *ic, sector_t sector, + const char *data, unsigned offset, char *result) { __le64 sector_le = cpu_to_le64(sector); - SHASH_DESC_ON_STACK(req, ic->internal_hash); + SHASH_DESC_ON_STACK(req, ic->internal_shash); int r; unsigned int digest_size; - req->tfm = ic->internal_hash; + req->tfm = ic->internal_shash; r = crypto_shash_init(req); if (unlikely(r < 0)) { @@ -1664,7 +1673,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } - r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT); + r = crypto_shash_update(req, data + offset, ic->sectors_per_block << SECTOR_SHIFT); if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); goto failed; @@ -1676,7 +1685,70 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } - digest_size = crypto_shash_digestsize(ic->internal_hash); + digest_size = ic->internal_hash_digestsize; + if (unlikely(digest_size < ic->tag_size)) + memset(result + digest_size, 0, ic->tag_size - digest_size); + + return; + +failed: + /* this shouldn't happen anyway, the hash 
functions have no reason to fail */ + get_random_bytes(result, ic->tag_size); +} + +static void integrity_sector_checksum_ahash(struct dm_integrity_c *ic, struct ahash_request **ahash_req, + sector_t sector, struct page *page, unsigned offset, char *result) +{ + __le64 sector_le = cpu_to_le64(sector); + struct ahash_request *req; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist sg[3], *s = sg; + int r; + unsigned int digest_size; + unsigned int nbytes = 0; + + might_sleep(); + + req = *ahash_req; + if (unlikely(!req)) { + req = mempool_alloc(&ic->ahash_req_pool, GFP_NOIO); + *ahash_req = req; + } + + ahash_request_set_tfm(req, ic->internal_ahash); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + sg_init_table(sg, 3); + sg_set_buf(s, (const __u8 *)&ic->sb->salt, SALT_SIZE); + nbytes += SALT_SIZE; + s++; + } else { + sg_init_table(sg, 2); + } + + if (likely(!is_vmalloc_addr(&sector_le))) { + sg_set_buf(s, &sector_le, sizeof(sector_le)); + } else { + struct page *sec_page = vmalloc_to_page(&sector_le); + unsigned int sec_off = offset_in_page(&sector_le); + sg_set_page(s, sec_page, sizeof(sector_le), sec_off); + } + nbytes += sizeof(sector_le); + s++; + + sg_set_page(s, page, ic->sectors_per_block << SECTOR_SHIFT, offset); + nbytes += ic->sectors_per_block << SECTOR_SHIFT; + + ahash_request_set_crypt(req, sg, result, nbytes); + + r = crypto_wait_req(crypto_ahash_digest(req), &wait); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_ahash_digest", r); + goto failed; + } + + digest_size = ic->internal_hash_digestsize; if (unlikely(digest_size < ic->tag_size)) memset(result + digest_size, 0, ic->tag_size - digest_size); @@ -1687,6 +1759,41 @@ failed: get_random_bytes(result, ic->tag_size); } +static void integrity_sector_checksum(struct dm_integrity_c *ic, struct ahash_request **ahash_req, + sector_t sector, const char *data, unsigned offset, char *result) +{ + if (likely(ic->internal_shash != NULL)) + integrity_sector_checksum_shash(ic, sector, data, offset, result); + else + integrity_sector_checksum_ahash(ic, ahash_req, sector, (struct page *)data, offset, result); +} + +static void *integrity_kmap(struct dm_integrity_c *ic, struct page *p) +{ + if (likely(ic->internal_shash != NULL)) + return kmap_local_page(p); + else + return p; +} + +static void integrity_kunmap(struct dm_integrity_c *ic, const void *ptr) +{ + if (likely(ic->internal_shash != NULL)) + kunmap_local(ptr); +} + +static void *integrity_identity(struct dm_integrity_c *ic, void *data) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(offset_in_page(data)); + BUG_ON(!virt_addr_valid(data)); +#endif + if (likely(ic->internal_shash != NULL)) + return data; + else + return virt_to_page(data); +} + static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum) { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); @@ -1711,6 +1818,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks sector_t alignment; char *mem; char *buffer = page_to_virt(page); + unsigned int buffer_offset; int r; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -1728,7 +1836,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks alignment &= -alignment; io_loc.sector = round_down(io_loc.sector, alignment); io_loc.count += sector - io_loc.sector; - buffer += (sector - io_loc.sector) << SECTOR_SHIFT; + buffer_offset = (sector - io_loc.sector) << SECTOR_SHIFT;
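The memcmp()-to-crypto_memneq() conversions in the hunks above, and the XOR-accumulating loop in dm_integrity_rw_tag(), follow the usual constant-time comparison idiom: never branch on individual byte matches while scanning tag data. A minimal sketch of that idiom, with hypothetical helper names that are not part of this patch:

#include <linux/types.h>
#include <crypto/utils.h>

/* Sketch only: crypto_memneq() returns non-zero when the buffers differ
 * and avoids data-dependent early exits, unlike memcmp(). */
static bool tag_matches(const u8 *stored, const u8 *computed, size_t len)
{
	return !crypto_memneq(stored, computed, len);
}

/* Open-coded equivalent: accumulate differences instead of branching. */
static bool tag_matches_open_coded(const u8 *stored, const u8 *computed, size_t len)
{
	u8 diff = 0;
	size_t i;

	for (i = 0; i < len; i++)
		diff |= stored[i] ^ computed[i];
	return diff == 0;
}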
io_loc.count = round_up(io_loc.count, alignment); r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); @@ -1737,7 +1845,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks goto free_ret; } - integrity_sector_checksum(ic, logical_sector, buffer, checksum); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, integrity_identity(ic, buffer), buffer_offset, checksum); r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block, &dio->metadata_offset, ic->tag_size, TAG_CMP); if (r) { @@ -1754,7 +1862,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks } mem = bvec_kmap_local(&bv); - memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT); + memcpy(mem + pos, buffer + buffer_offset, ic->sectors_per_block << SECTOR_SHIFT); kunmap_local(mem); pos += ic->sectors_per_block << SECTOR_SHIFT; @@ -1776,7 +1884,7 @@ static void integrity_metadata(struct work_struct *w) if (ic->internal_hash) { struct bvec_iter iter; struct bio_vec bv; - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; @@ -1837,17 +1945,17 @@ static void integrity_metadata(struct work_struct *w) char *mem, *checksums_ptr; again: - mem = bvec_kmap_local(&bv_copy); + mem = integrity_kmap(ic, bv_copy.bv_page); pos = 0; checksums_ptr = checksums; do { - integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + integrity_sector_checksum(ic, &dio->ahash_req, sector, mem, bv_copy.bv_offset + pos, checksums_ptr); checksums_ptr += ic->tag_size; sectors_to_process -= ic->sectors_per_block; pos += ic->sectors_per_block << SECTOR_SHIFT; sector += ic->sectors_per_block; } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack); - kunmap_local(mem); + integrity_kunmap(ic, mem); r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, checksums_ptr - checksums, dio->op == REQ_OP_READ ? 
TAG_CMP : TAG_WRITE); @@ -1949,6 +2057,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->ic = ic; dio->bi_status = 0; dio->op = bio_op(bio); + dio->ahash_req = NULL; if (ic->mode == 'I') { bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector); @@ -2071,19 +2180,6 @@ retry_kmap: js++; mem_ptr += 1 << SECTOR_SHIFT; } while (++s < ic->sectors_per_block); -#ifdef INTERNAL_VERIFY - if (ic->internal_hash) { - char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; - - integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { - DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", - logical_sector); - dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", - bio, logical_sector, 0); - } - } -#endif } if (!ic->internal_hash) { @@ -2124,15 +2220,17 @@ retry_kmap: } while (++s < ic->sectors_per_block); if (ic->internal_hash) { - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); if (unlikely(digest_size > ic->tag_size)) { char checksums_onstack[HASH_MAX_DIGESTSIZE]; - integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, checksums_onstack); memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); } else - integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, journal_entry_tag(ic, je)); } journal_entry_set_sector(je, logical_sector); @@ -2428,7 +2526,7 @@ retry: if (!dio->integrity_payload) { unsigned digest_size, extra_size; dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); - digest_size = crypto_shash_digestsize(ic->internal_hash); + digest_size = ic->internal_hash_digestsize; extra_size = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; dio->payload_len += extra_size; dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -2505,11 +2603,11 @@ skip_spinlock: unsigned pos = 0; while (dio->bio_details.bi_iter.bi_size) { struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - const char *mem = bvec_kmap_local(&bv); + const char *mem = integrity_kmap(ic, bv.bv_page); if (ic->tag_size < ic->tuple_size) memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tuple_size); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos); - kunmap_local(mem); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, dio->integrity_payload + pos); + integrity_kunmap(ic, mem); pos += ic->tuple_size; bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); } @@ -2558,14 +2656,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w) char *mem; outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios); - - r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0); - if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) { - bio_put(outgoing_bio); - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return; - } + bio_add_virt_nofail(outgoing_bio, outgoing_data, + ic->sectors_per_block << SECTOR_SHIFT); bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1); if (IS_ERR(bip)) { @@ -2594,8 +2686,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w) } bio_put(outgoing_bio); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, integrity_identity(ic, outgoing_data), 0, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(ic->internal_hash_digestsize, ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2618,33 +2710,58 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_endio(bio); } +static inline bool dm_integrity_check(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned pos = 0; + + while (dio->bio_details.bi_iter.bi_size) { + char digest[HASH_MAX_DIGESTSIZE]; + struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + char *mem = integrity_kmap(ic, bv.bv_page); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, + min(ic->internal_hash_digestsize, ic->tag_size)))) { + integrity_kunmap(ic, mem); + dm_integrity_free_payload(dio); + INIT_WORK(&dio->work, dm_integrity_inline_recheck); + queue_work(ic->offload_wq, &dio->work); + return false; + } + integrity_kunmap(ic, mem); + pos += ic->tuple_size; + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + + return true; +} + +static void dm_integrity_inline_async_check(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, 
work); + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (likely(dm_integrity_check(ic, dio))) + bio_endio(bio); +} + static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) { struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); if (ic->mode == 'I') { - struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) { - unsigned pos = 0; + if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK) && likely(dio->bio_details.bi_iter.bi_size != 0)) { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && unlikely(dio->integrity_range_locked)) - goto skip_check; - while (dio->bio_details.bi_iter.bi_size) { - char digest[HASH_MAX_DIGESTSIZE]; - struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - char *mem = bvec_kmap_local(&bv); - //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(memcmp(digest, dio->integrity_payload + pos, - min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { - kunmap_local(mem); - dm_integrity_free_payload(dio); - INIT_WORK(&dio->work, dm_integrity_inline_recheck); - queue_work(ic->offload_wq, &dio->work); + goto skip_check; + if (likely(ic->internal_shash != NULL)) { + if (unlikely(!dm_integrity_check(ic, dio))) return DM_ENDIO_INCOMPLETE; - } - kunmap_local(mem); - pos += ic->tuple_size; - bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } else { + INIT_WORK(&dio->work, dm_integrity_inline_async_check); + queue_work(ic->offload_wq, &dio->work); + return DM_ENDIO_INCOMPLETE; } } skip_check: @@ -2652,6 +2769,8 @@ skip_check: if (unlikely(dio->integrity_range_locked)) remove_range(ic, &dio->range); } + if (unlikely(dio->ahash_req)) + mempool_free(dio->ahash_req, &ic->ahash_req_pool); return DM_ENDIO_DONE; } @@ -2708,7 +2827,7 @@ static void integrity_commit(struct work_struct *w) unsigned int i, j, n; struct bio *flushes; - del_timer(&ic->autocommit_timer); + timer_delete(&ic->autocommit_timer); if (ic->mode == 'I') return; @@ -2908,10 +3027,13 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start #endif ic->internal_hash) { char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + struct journal_sector *js = access_journal_data(ic, i, l); + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); - integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), - (char *)access_journal_data(ic, i, l), test_tag); - if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { + integrity_sector_checksum(ic, &ic->journal_ahash_req, sec + ((l - j) << ic->sb->log2_sectors_per_block), + js_page, js_offset, test_tag); + if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); } @@ -2993,6 +3115,7 @@ static void integrity_recalc(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct dm_io_request io_req; 
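The reworked dm_integrity_end_io() above defers ahash verification by returning DM_ENDIO_INCOMPLETE and completing the bio from a workqueue once the check has run. A condensed sketch of that deferral pattern, using hypothetical names rather than the patch's exact code:

#include <linux/bio.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>

/* Sketch, assuming a per-bio context with an embedded work_struct. */
struct my_endio_ctx {
	struct work_struct work;
	struct bio *bio;
};

static void my_async_check(struct work_struct *w)
{
	struct my_endio_ctx *ctx = container_of(w, struct my_endio_ctx, work);

	/* ... perform the possibly-sleeping verification here ... */
	bio_endio(ctx->bio);	/* complete the bio once the check is done */
}

static int my_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct my_endio_ctx *ctx = dm_per_bio_data(bio, sizeof(*ctx));

	if (*status != BLK_STS_OK)
		return DM_ENDIO_DONE;

	ctx->bio = bio;
	INIT_WORK(&ctx->work, my_async_check);
	queue_work(system_wq, &ctx->work);	/* dm-integrity uses its own offload_wq */
	return DM_ENDIO_INCOMPLETE;		/* dm core must not end the bio yet */
}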
struct dm_io_region io_loc; @@ -3007,7 +3130,7 @@ static void integrity_recalc(struct work_struct *w) unsigned recalc_sectors = RECALC_SECTORS; retry: - recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO); + recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN); if (!recalc_buffer) { oom: recalc_sectors >>= 1; @@ -3017,11 +3140,11 @@ oom: goto free_ret; } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + if (ic->internal_hash_digestsize > ic->tag_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tag_size; recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO); if (!recalc_tags) { - vfree(recalc_buffer); + kfree(recalc_buffer); recalc_buffer = NULL; goto oom; } @@ -3087,7 +3210,7 @@ next_chunk: goto err; io_req.bi_opf = REQ_OP_READ; - io_req.mem.type = DM_IO_VMA; + io_req.mem.type = DM_IO_KMEM; io_req.mem.ptr.addr = recalc_buffer; io_req.notify.fn = NULL; io_req.client = ic->io; @@ -3103,7 +3226,10 @@ next_chunk: t = recalc_tags; for (i = 0; i < n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); + integrity_sector_checksum(ic, &ahash_req, logical_sector + i, ptr_page, ptr_offset, t); t += ic->tag_size; } @@ -3145,8 +3271,9 @@ unlock_ret: recalc_write_super(ic); free_ret: - vfree(recalc_buffer); + kfree(recalc_buffer); kvfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void integrity_recalc_inline(struct work_struct *w) @@ -3155,6 +3282,7 @@ static void integrity_recalc_inline(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct bio *bio; struct bio_integrity_payload *bip; @@ -3177,8 +3305,8 @@ oom: } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size; + if (ic->internal_hash_digestsize > ic->tuple_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tuple_size; recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN); if (!recalc_tags) { kfree(recalc_buffer); @@ -3212,7 +3340,8 @@ next_chunk: bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios); bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector; - __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer)); + bio_add_virt_nofail(bio, recalc_buffer, + range.n_sectors << SECTOR_SHIFT); r = submit_bio_wait(bio); bio_put(bio); if (unlikely(r)) { @@ -3222,14 +3351,18 @@ next_chunk: t = recalc_tags; for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); memset(t, 0, ic->tuple_size); - integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + integrity_sector_checksum(ic, &ahash_req, 
range.logical_sector + i, ptr_page, ptr_offset, t); t += ic->tuple_size; } bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios); bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector; - __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer)); + bio_add_virt_nofail(bio, recalc_buffer, + range.n_sectors << SECTOR_SHIFT); bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (unlikely(IS_ERR(bip))) { @@ -3274,6 +3407,7 @@ unlock_ret: free_ret: kfree(recalc_buffer); kfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void bitmap_block_work(struct work_struct *w) @@ -3607,7 +3741,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); - del_timer_sync(&ic->autocommit_timer); + timer_delete_sync(&ic->autocommit_timer); if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); @@ -3790,20 +3924,18 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: { - __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; - - watermark_percentage += ic->journal_entries / 2; - do_div(watermark_percentage, ic->journal_entries); - arg_count = 3; + arg_count = 1; /* buffer_sectors */ arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); arg_count += ic->reset_recalculate_flag; arg_count += ic->discard; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'B'; - arg_count += ic->mode == 'B'; + arg_count += ic->mode != 'I'; /* interleave_sectors */ + arg_count += ic->mode == 'J'; /* journal_sectors */ + arg_count += ic->mode == 'J'; /* journal_watermark */ + arg_count += ic->mode == 'J'; /* commit_time */ + arg_count += ic->mode == 'B'; /* sectors_per_bit */ + arg_count += ic->mode == 'B'; /* bitmap_flush_interval */ arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -3822,10 +3954,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" reset_recalculate"); if (ic->discard) DMEMIT(" allow_discards"); - DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); - DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); + if (ic->mode != 'I') + DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); if (ic->mode == 'J') { + __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; + + watermark_percentage += ic->journal_entries / 2; + do_div(watermark_percentage, ic->journal_entries); + DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); } @@ -3907,8 +4044,8 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim struct blk_integrity *bi = &limits->integrity; memset(bi, 0, sizeof(*bi)); - bi->tuple_size = ic->tag_size; - bi->tag_size = bi->tuple_size; + bi->metadata_size = ic->tag_size; + bi->tag_size = bi->metadata_size; bi->interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; } @@ -4211,30 +4348,53 @@ nomem: return -ENOMEM; } -static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char 
**error, - char *error_alg, char *error_key) +static int get_mac(struct crypto_shash **shash, struct crypto_ahash **ahash, + struct alg_spec *a, char **error, char *error_alg, char *error_key) { int r; if (a->alg_string) { - *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(*hash)) { - *error = error_alg; - r = PTR_ERR(*hash); - *hash = NULL; - return r; - } - - if (a->key) { - r = crypto_shash_setkey(*hash, a->key, a->key_size); - if (r) { + if (shash) { + *shash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*shash)) { + *shash = NULL; + goto try_ahash; + } + if (a->key) { + r = crypto_shash_setkey(*shash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_shash_get_flags(*shash) & CRYPTO_TFM_NEED_KEY) { *error = error_key; + return -ENOKEY; + } + return 0; + } +try_ahash: + if (ahash) { + *ahash = crypto_alloc_ahash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*ahash)) { + *error = error_alg; + r = PTR_ERR(*ahash); + *ahash = NULL; return r; } - } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) { - *error = error_key; - return -ENOKEY; + if (a->key) { + r = crypto_ahash_setkey(*ahash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_ahash_get_flags(*ahash) & CRYPTO_TFM_NEED_KEY) { + *error = error_key; + return -ENOKEY; + } + return 0; } + *error = error_alg; + return -ENOENT; } return 0; @@ -4691,12 +4851,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv buffer_sectors = 1; ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT); - r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + r = get_mac(&ic->internal_shash, &ic->internal_ahash, &ic->internal_hash_alg, &ti->error, "Invalid internal hash", "Error setting internal hash key"); if (r) goto bad; + if (ic->internal_shash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_shash_digestsize(ic->internal_shash); + } + if (ic->internal_ahash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_ahash_digestsize(ic->internal_ahash); + r = mempool_init_kmalloc_pool(&ic->ahash_req_pool, AHASH_MEMPOOL, + sizeof(struct ahash_request) + crypto_ahash_reqsize(ic->internal_ahash)); + if (r) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + } - r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + r = get_mac(&ic->journal_mac, NULL, &ic->journal_mac_alg, &ti->error, "Invalid journal mac", "Error setting journal mac key"); if (r) goto bad; @@ -4707,7 +4881,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv r = -EINVAL; goto bad; } - ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + ic->tag_size = ic->internal_hash_digestsize; } if (ic->tag_size > MAX_TAG_SIZE) { ti->error = "Too big tag size"; @@ -4747,18 +4921,18 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Integrity profile not supported"; goto bad; } - /*printk("tag_size: %u, tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/ - if (bi->tuple_size < ic->tag_size) { + /*printk("tag_size: %u, metadata_size: %u\n", bi->tag_size, bi->metadata_size);*/ + if (bi->metadata_size < ic->tag_size) { r = -EINVAL; ti->error = "The integrity profile is smaller than tag size"; goto bad; } - if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) { + if ((unsigned long)bi->metadata_size > PAGE_SIZE / 2) { r 
= -EINVAL; ti->error = "Too big tuple size"; goto bad; } - ic->tuple_size = bi->tuple_size; + ic->tuple_size = bi->metadata_size; if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) { r = -EINVAL; ti->error = "Integrity profile sector size mismatch"; @@ -4808,23 +4982,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS); if (r) { ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recalc_bios, 1); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", @@ -5081,16 +5243,19 @@ try_smaller_buffer: ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->recalc_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->may_write_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); if (!ic->bbs) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } @@ -5171,7 +5336,7 @@ static void dm_integrity_dtr(struct dm_target *ti) BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); BUG_ON(!list_empty(&ic->wait_list)); - if (ic->mode == 'B') + if (ic->mode == 'B' && ic->bitmap_flush_work.work.func) cancel_delayed_work_sync(&ic->bitmap_flush_work); if (ic->metadata_wq) destroy_workqueue(ic->metadata_wq); @@ -5188,6 +5353,8 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + mempool_free(ic->journal_ahash_req, &ic->ahash_req_pool); + mempool_exit(&ic->ahash_req_pool); bioset_exit(&ic->recalc_bios); bioset_exit(&ic->recheck_bios); mempool_exit(&ic->recheck_pool); @@ -5225,8 +5392,10 @@ static void dm_integrity_dtr(struct dm_target *ti) if (ic->sb) free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); - if (ic->internal_hash) - crypto_free_shash(ic->internal_hash); + if (ic->internal_shash) + crypto_free_shash(ic->internal_shash); + if (ic->internal_ahash) + crypto_free_ahash(ic->internal_ahash); free_alg(&ic->internal_hash_alg); if (ic->journal_crypt) @@ -5243,7 +5412,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index d7a8e2f40db3..c37668790577 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -379,6 +379,7 @@ static void do_region(const blk_opf_t opf, unsigned int region, atomic_inc(&io->count); submit_bio(bio); + WARN_ON_ONCE(opf & REQ_ATOMIC && remaining); } while (remaining); } diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d42eac944eb5..4165fef4c170 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1885,6 +1885,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, {DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, 
{DM_GET_TARGET_VERSION_CMD, 0, get_target_version}, + {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */ }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 49fb0f684193..73bf290af181 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -119,7 +119,9 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct linear_c *lc = ti->private; struct dm_dev *dev = lc->dev; @@ -168,7 +170,7 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); @@ -199,9 +201,10 @@ static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, static struct target_type linear_target = { .name = "linear", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, + DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO | + DM_TARGET_ATOMIC_WRITES, .report_zones = linear_report_zones, .module = THIS_MODULE, .ctr = linear_ctr, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8d7df8303d0a..7bb7174f8f4f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -414,7 +414,7 @@ static int log_super(struct log_writes_c *lc) } /* - * Super sector should be writen in-order, otherwise the + * Super sector should be written in-order, otherwise the * nr_entries could be rewritten incorrectly by an old bio. 
*/ wait_for_completion_io(&lc->super_done); @@ -818,7 +818,9 @@ static void log_writes_status(struct dm_target *ti, status_type_t type, } static int log_writes_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct log_writes_c *lc = ti->private; struct dm_dev *dev = lc->dev; @@ -891,7 +893,7 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti, static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 637977acc3dc..aaf4a0a4b0eb 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -79,6 +79,7 @@ struct multipath { struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ + struct priority_group *last_probed_pg; atomic_t nr_valid_paths; /* Total number of usable paths */ unsigned int nr_priority_groups; @@ -87,6 +88,7 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + wait_queue_head_t probe_wait; /* Wait for probing paths */ unsigned int pg_init_retries; /* Number of times to retry pg_init */ unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ @@ -100,6 +102,7 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ + bool is_suspending; }; /* @@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */ +#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */ static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) { @@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) atomic_set(&m->pg_init_count, 0); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; init_waitqueue_head(&m->pg_init_wait); + init_waitqueue_head(&m->probe_wait); return 0; } @@ -413,13 +419,21 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) goto failed; } + /* Don't change PG until it has no remaining paths */ + pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } + /* Were we instructed to switch PG? 
*/ if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { spin_unlock_irqrestore(&m->lock, flags); - goto check_current_pg; + goto check_all_pgs; } m->next_pg = NULL; spin_unlock_irqrestore(&m->lock, flags); @@ -427,16 +441,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) if (!IS_ERR_OR_NULL(pgpath)) return pgpath; } - - /* Don't change PG until it has no remaining paths */ -check_current_pg: - pg = READ_ONCE(m->current_pg); - if (pg) { - pgpath = choose_path_in_pg(m, pg, nr_bytes); - if (!IS_ERR_OR_NULL(pgpath)) - return pgpath; - } - +check_all_pgs: /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. @@ -612,7 +617,6 @@ static void multipath_queue_bio(struct multipath *m, struct bio *bio) static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; - unsigned long flags; /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); @@ -620,12 +624,12 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if (!pgpath) { - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { __multipath_queue_bio(m, bio); pgpath = ERR_PTR(-EAGAIN); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { @@ -688,7 +692,6 @@ static void process_queued_io_list(struct multipath *m) static void process_queued_bios(struct work_struct *work) { int r; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -697,16 +700,16 @@ static void process_queued_bios(struct work_struct *work) bio_list_init(&bios); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (bio_list_empty(&m->queued_bios)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); return; } bio_list_merge_init(&bios, &m->queued_bios); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -787,7 +790,7 @@ static int queue_if_no_path(struct multipath *m, bool f_queue_if_no_path, */ static void queue_if_no_path_timeout_work(struct timer_list *t) { - struct multipath *m = from_timer(m, t, nopath_timer); + struct multipath *m = timer_container_of(m, t, nopath_timer); DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_table_device_name(m->ti->table)); @@ -815,7 +818,7 @@ static void enable_nopath_timeout(struct multipath *m) static void disable_nopath_timeout(struct multipath *m) { - del_timer_sync(&m->nopath_timer); + timer_delete_sync(&m->nopath_timer); } /* @@ -1190,7 +1193,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_arg_set as; unsigned int pg_count = 0; unsigned int next_pg_num; - unsigned long flags; as.argc = argc; as.argv = argv; @@ -1255,9 +1257,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); ti->num_flush_bios = 1; ti->num_discard_bios = 1; @@ -1292,23 +1294,21 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { if 
(m->hw_handler_name) { - unsigned long flags; - if (!atomic_read(&m->pg_init_in_progress)) goto skip; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (atomic_read(&m->pg_init_in_progress) && !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } skip: if (m->queue_mode == DM_TYPE_BIO_BASED) @@ -1370,11 +1370,10 @@ out: static int reinstate_path(struct pgpath *pgpath) { int r = 0, run_queue = 0; - unsigned long flags; struct multipath *m = pgpath->pg->m; unsigned int nr_valid_paths; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (pgpath->is_active) goto out; @@ -1404,7 +1403,7 @@ static int reinstate_path(struct pgpath *pgpath) schedule_work(&m->trigger_event); out: - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); if (run_queue) { dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); @@ -1439,15 +1438,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action) * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - bool bypassed) + bool bypassed, bool can_be_delayed) { unsigned long flags; spin_lock_irqsave(&m->lock, flags); pg->bypassed = bypassed; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } spin_unlock_irqrestore(&m->lock, flags); @@ -1461,7 +1464,6 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) { struct priority_group *pg; unsigned int pgnum; - unsigned long flags; char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || @@ -1470,17 +1472,21 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) return -EINVAL; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); list_for_each_entry(pg, &m->priority_groups, list) { pg->bypassed = false; if (--pgnum) continue; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } m->next_pg = pg; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); schedule_work(&m->trigger_event); return 0; @@ -1507,7 +1513,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) break; } - bypass_pg(m, pg, bypassed); + bypass_pg(m, pg, bypassed, true); return 0; } @@ -1561,7 +1567,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, true); + bypass_pg(m, pg, true, false); break; case SCSI_DH_RETRY: /* Wait before retrying. 
*/ @@ -1742,6 +1748,9 @@ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; + spin_lock_irq(&m->lock); + m->is_suspending = true; + spin_unlock_irq(&m->lock); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1762,9 +1771,9 @@ static void multipath_postsuspend(struct dm_target *ti) static void multipath_resume(struct dm_target *ti) { struct multipath *m = ti->private; - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); + m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -1775,7 +1784,7 @@ static void multipath_resume(struct dm_target *ti) test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } /* @@ -1798,14 +1807,13 @@ static void multipath_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { int sz = 0, pg_counter, pgpath_counter; - unsigned long flags; struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *p; unsigned int pg_num; char state; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); /* Features */ if (type == STATUSTYPE_INFO) @@ -1845,10 +1853,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("%u ", m->nr_priority_groups); - if (m->next_pg) - pg_num = m->next_pg->pg_num; - else if (m->current_pg) + if (m->current_pg) pg_num = m->current_pg->pg_num; + else if (m->next_pg) + pg_num = m->next_pg->pg_num; else pg_num = (m->nr_priority_groups ? 1 : 0); @@ -1951,7 +1959,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, break; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv, @@ -1961,7 +1969,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg dev_t dev; struct multipath *m = ti->private; action_fn action; - unsigned long flags; mutex_lock(&m->work_mutex); @@ -1973,9 +1980,9 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, true, false, __func__); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, false, false, __func__); @@ -2021,14 +2028,132 @@ out: return r; } +/* + * Perform a minimal read from the given path to find out whether the + * path still works. If a path error occurs, fail it. 
+ */ +static int probe_path(struct pgpath *pgpath) +{ + struct block_device *bdev = pgpath->path.dev->bdev; + unsigned int read_size = bdev_logical_block_size(bdev); + struct page *page; + struct bio *bio; + blk_status_t status; + int r = 0; + + if (WARN_ON_ONCE(read_size > PAGE_SIZE)) + return -EINVAL; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* Perform a minimal read: Sector 0, length read_size */ + bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); + if (!bio) { + r = -ENOMEM; + goto out; + } + + bio->bi_iter.bi_sector = 0; + __bio_add_page(bio, page, read_size, 0); + submit_bio_wait(bio); + status = bio->bi_status; + bio_put(bio); + + if (status && blk_path_error(status)) + fail_path(pgpath); + +out: + __free_page(page); + return r; +} + +/* + * Probe all active paths in current_pg to find out whether they still work. + * Fail all paths that do not work. + * + * Return -ENOTCONN if no valid path is left (even outside of current_pg). We + * cannot probe paths in other pgs without switching current_pg, so if valid + * paths are only in different pgs, they may or may not work. Additionally + * we should not probe paths in a pathgroup that is in the process of + * Initializing. Userspace can submit a request and we'll switch and wait + * for the pathgroup to be initialized. If the request fails, it may need to + * probe again. + */ +static int probe_active_paths(struct multipath *m) +{ + struct pgpath *pgpath; + struct priority_group *pg = NULL; + int r = 0; + + spin_lock_irq(&m->lock); + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { + wait_event_lock_irq(m->probe_wait, + !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), + m->lock); + /* + * if we waited because a probe was already in progress, + * and it probed the current active pathgroup, don't + * reprobe. 
Just return the number of valid paths + */ + if (m->current_pg == m->last_probed_pg) + goto skip_probe; + } + if (!m->current_pg || m->is_suspending || + test_bit(MPATHF_QUEUE_IO, &m->flags)) + goto skip_probe; + set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + pg = m->last_probed_pg = m->current_pg; + spin_unlock_irq(&m->lock); + + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pg != READ_ONCE(m->current_pg) || + READ_ONCE(m->is_suspending)) + goto out; + if (!pgpath->is_active) + continue; + + r = probe_path(pgpath); + if (r < 0) + goto out; + } + +out: + spin_lock_irq(&m->lock); + clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { + m->current_pgpath = NULL; + m->current_pg = NULL; + } +skip_probe: + if (r == 0 && !atomic_read(&m->nr_valid_paths)) + r = -ENOTCONN; + spin_unlock_irq(&m->lock); + if (pg) + wake_up(&m->probe_wait); + return r; +} + static int multipath_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct multipath *m = ti->private; struct pgpath *pgpath; - unsigned long flags; int r; + if (_IOC_TYPE(cmd) == DM_IOCTL) { + *forward = false; + switch (cmd) { + case DM_MPATH_PROBE_PATHS: + return probe_active_paths(m); + default: + return -ENOTTY; + } + } + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, 0); @@ -2044,10 +2169,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } else { /* No path is available */ r = -EIO; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } if (r == -ENOTCONN) { @@ -2055,10 +2180,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) (void) __pg_init_all_paths(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); } @@ -2180,7 +2305,7 @@ static int multipath_busy(struct dm_target *ti) */ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index 3e4cb81ce512..d0b883fabfeb 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -117,16 +117,16 @@ int dm_register_path_selector(struct path_selector_type *pst) } EXPORT_SYMBOL_GPL(dm_register_path_selector); -int dm_unregister_path_selector(struct path_selector_type *pst) +void dm_unregister_path_selector(struct path_selector_type *pst) { struct ps_internal *psi; down_write(&_ps_lock); psi = __find_path_selector_type(pst->name); - if (!psi) { + if (WARN_ON(!psi)) { up_write(&_ps_lock); - return -EINVAL; + return; } list_del(&psi->list); @@ -134,7 +134,5 @@ int dm_unregister_path_selector(struct path_selector_type *pst) up_write(&_ps_lock); kfree(psi); - - return 0; } EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 3861b2d8b963..7b2270532e64 100644 --- 
a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -96,7 +96,7 @@ struct path_selector_type { int dm_register_path_selector(struct path_selector_type *type); /* Unregister a path selector */ -int dm_unregister_path_selector(struct path_selector_type *type); +void dm_unregister_path_selector(struct path_selector_type *type); /* Returns a registered path selector type */ struct path_selector_type *dm_get_path_selector(const char *name); diff --git a/drivers/md/dm-pcache/Kconfig b/drivers/md/dm-pcache/Kconfig new file mode 100644 index 000000000000..0e251eca892e --- /dev/null +++ b/drivers/md/dm-pcache/Kconfig @@ -0,0 +1,17 @@ +config DM_PCACHE + tristate "Persistent cache for Block Device (Experimental)" + depends on BLK_DEV_DM + depends on DEV_DAX + help + PCACHE provides a mechanism to use persistent memory (e.g., CXL persistent memory, + DAX-enabled devices) as a high-performance cache layer in front of + traditional block devices such as SSDs or HDDs. + + PCACHE is implemented as a kernel module that integrates with the block + layer and supports direct access (DAX) to persistent memory for low-latency, + byte-addressable caching. + + Note: This feature is experimental and should be tested thoroughly + before use in production environments. + + If unsure, say 'N'. diff --git a/drivers/md/dm-pcache/Makefile b/drivers/md/dm-pcache/Makefile new file mode 100644 index 000000000000..cedfd38854f6 --- /dev/null +++ b/drivers/md/dm-pcache/Makefile @@ -0,0 +1,3 @@ +dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o + +obj-$(CONFIG_DM_PCACHE) += dm-pcache.o diff --git a/drivers/md/dm-pcache/backing_dev.c b/drivers/md/dm-pcache/backing_dev.c new file mode 100644 index 000000000000..7165fc0364bb --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blkdev.h> + +#include "../dm-core.h" +#include "pcache_internal.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static struct kmem_cache *backing_req_cache; +static struct kmem_cache *backing_bvec_cache; + +static void backing_dev_exit(struct pcache_backing_dev *backing_dev) +{ + mempool_exit(&backing_dev->req_pool); + mempool_exit(&backing_dev->bvec_pool); +} + +static void req_submit_fn(struct work_struct *work); +static void req_complete_fn(struct work_struct *work); +static int backing_dev_init(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = mempool_init_slab_pool(&backing_dev->req_pool, 128, backing_req_cache); + if (ret) + goto err; + + ret = mempool_init_slab_pool(&backing_dev->bvec_pool, 128, backing_bvec_cache); + if (ret) + goto req_pool_exit; + + INIT_LIST_HEAD(&backing_dev->submit_list); + INIT_LIST_HEAD(&backing_dev->complete_list); + spin_lock_init(&backing_dev->submit_lock); + spin_lock_init(&backing_dev->complete_lock); + INIT_WORK(&backing_dev->req_submit_work, req_submit_fn); + INIT_WORK(&backing_dev->req_complete_work, req_complete_fn); + atomic_set(&backing_dev->inflight_reqs, 0); + init_waitqueue_head(&backing_dev->inflight_wq); + + return 0; + +req_pool_exit: + mempool_exit(&backing_dev->req_pool); +err: + return ret; +} + +int backing_dev_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = backing_dev_init(pcache); + if (ret) + return ret; + + 
backing_dev->dev_size = bdev_nr_sectors(backing_dev->dm_dev->bdev); + + return 0; +} + +void backing_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + + /* + * There should not be any new request comming, just wait + * inflight requests done. + */ + wait_event(backing_dev->inflight_wq, + atomic_read(&backing_dev->inflight_reqs) == 0); + + flush_work(&backing_dev->req_submit_work); + flush_work(&backing_dev->req_complete_work); + + backing_dev_exit(backing_dev); +} + +/* pcache_backing_dev_req functions */ +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (backing_req->end_req) + backing_req->end_req(backing_req, backing_req->ret); + + switch (backing_req->type) { + case BACKING_DEV_REQ_TYPE_REQ: + if (backing_req->req.upper_req) + pcache_req_put(backing_req->req.upper_req, backing_req->ret); + break; + case BACKING_DEV_REQ_TYPE_KMEM: + if (backing_req->kmem.bvecs != backing_req->kmem.inline_bvecs) + mempool_free(backing_req->kmem.bvecs, &backing_dev->bvec_pool); + break; + default: + BUG(); + } + + mempool_free(backing_req, &backing_dev->req_pool); + + if (atomic_dec_and_test(&backing_dev->inflight_reqs)) + wake_up(&backing_dev->inflight_wq); +} + +static void req_complete_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_complete_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock_irq(&backing_dev->complete_lock); + list_splice_init(&backing_dev->complete_list, &tmp_list); + spin_unlock_irq(&backing_dev->complete_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } +} + +static void backing_dev_bio_end(struct bio *bio) +{ + struct pcache_backing_dev_req *backing_req = bio->bi_private; + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + unsigned long flags; + + backing_req->ret = blk_status_to_errno(bio->bi_status); + + spin_lock_irqsave(&backing_dev->complete_lock, flags); + list_move_tail(&backing_req->node, &backing_dev->complete_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_complete_work); + spin_unlock_irqrestore(&backing_dev->complete_lock, flags); +} + +static void req_submit_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_submit_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock(&backing_dev->submit_lock); + list_splice_init(&backing_dev->submit_list, &tmp_list); + spin_unlock(&backing_dev->submit_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + submit_bio_noacct(&backing_req->bio); + } +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (direct) { + submit_bio_noacct(&backing_req->bio); + return; + } + + spin_lock(&backing_dev->submit_lock); + list_add_tail(&backing_req->node, &backing_dev->submit_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_submit_work); + spin_unlock(&backing_dev->submit_lock); +} + +static void bio_map(struct bio 
*bio, void *base, size_t size) +{ + struct page *page; + unsigned int offset; + unsigned int len; + + if (!is_vmalloc_addr(base)) { + page = virt_to_page(base); + offset = offset_in_page(base); + + BUG_ON(!bio_add_page(bio, page, size, offset)); + return; + } + + flush_kernel_vmap_range(base, size); + while (size) { + page = vmalloc_to_page(base); + offset = offset_in_page(base); + len = min_t(size_t, PAGE_SIZE - offset, size); + + BUG_ON(!bio_add_page(bio, page, len, offset)); + size -= len; + base += len; + } +} + +static struct pcache_backing_dev_req *req_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct pcache_backing_dev_req *backing_req; + struct bio *orig = pcache_req->bio; + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + bio_init_clone(backing_dev->dm_dev->bdev, &backing_req->bio, orig, opts->gfp_mask); + + backing_req->type = BACKING_DEV_REQ_TYPE_REQ; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; +} + +static struct pcache_backing_dev_req *kmem_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + u32 n_vecs = bio_add_max_vecs(opts->kmem.data, opts->kmem.len); + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + if (n_vecs > BACKING_DEV_REQ_INLINE_BVECS) { + backing_req->kmem.bvecs = mempool_alloc(&backing_dev->bvec_pool, opts->gfp_mask); + if (!backing_req->kmem.bvecs) + goto free_backing_req; + } else { + backing_req->kmem.bvecs = backing_req->kmem.inline_bvecs; + } + + backing_req->kmem.n_vecs = n_vecs; + backing_req->type = BACKING_DEV_REQ_TYPE_KMEM; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; + +free_backing_req: + mempool_free(backing_req, &backing_dev->req_pool); + return NULL; +} + +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_alloc(backing_dev, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_alloc(backing_dev, opts); + + BUG(); +} + +static void req_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct bio *clone; + u32 off = opts->req.req_off; + u32 len = opts->req.len; + + clone = &backing_req->bio; + BUG_ON(off & SECTOR_MASK); + BUG_ON(len & SECTOR_MASK); + bio_trim(clone, off >> SECTOR_SHIFT, len >> SECTOR_SHIFT); + + clone->bi_iter.bi_sector = (pcache_req->off + off) >> SECTOR_SHIFT; + clone->bi_private = backing_req; + clone->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + + pcache_req_get(pcache_req); + backing_req->req.upper_req = pcache_req; + backing_req->req.bio_off = off; +} + +static void kmem_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + struct bio *backing_bio; + + 
bio_init(&backing_req->bio, backing_dev->dm_dev->bdev, backing_req->kmem.bvecs, + backing_req->kmem.n_vecs, opts->kmem.opf); + + backing_bio = &backing_req->bio; + bio_map(backing_bio, opts->kmem.data, opts->kmem.len); + + backing_bio->bi_iter.bi_sector = (opts->kmem.backing_off) >> SECTOR_SHIFT; + backing_bio->bi_private = backing_req; + backing_bio->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + backing_req->priv_data = opts->priv_data; +} + +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_init(backing_req, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_init(backing_req, opts); + + BUG(); +} + +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + + backing_req = backing_dev_req_alloc(backing_dev, opts); + if (!backing_req) + return NULL; + + backing_dev_req_init(backing_req, opts); + + return backing_req; +} + +void backing_dev_flush(struct pcache_backing_dev *backing_dev) +{ + blkdev_issue_flush(backing_dev->dm_dev->bdev); +} + +int pcache_backing_init(void) +{ + u32 max_bvecs = (PCACHE_CACHE_SUBTREE_SIZE >> PAGE_SHIFT) + 1; + int ret; + + backing_req_cache = KMEM_CACHE(pcache_backing_dev_req, 0); + if (!backing_req_cache) { + ret = -ENOMEM; + goto err; + } + + backing_bvec_cache = kmem_cache_create("pcache-bvec-slab", + max_bvecs * sizeof(struct bio_vec), + 0, 0, NULL); + if (!backing_bvec_cache) { + ret = -ENOMEM; + goto destroy_req_cache; + } + + return 0; +destroy_req_cache: + kmem_cache_destroy(backing_req_cache); +err: + return ret; +} + +void pcache_backing_exit(void) +{ + kmem_cache_destroy(backing_bvec_cache); + kmem_cache_destroy(backing_req_cache); +} diff --git a/drivers/md/dm-pcache/backing_dev.h b/drivers/md/dm-pcache/backing_dev.h new file mode 100644 index 000000000000..b371cba483b9 --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _BACKING_DEV_H +#define _BACKING_DEV_H + +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +struct pcache_backing_dev_req; +typedef void (*backing_req_end_fn_t)(struct pcache_backing_dev_req *backing_req, int ret); + +#define BACKING_DEV_REQ_TYPE_REQ 1 +#define BACKING_DEV_REQ_TYPE_KMEM 2 + +#define BACKING_DEV_REQ_INLINE_BVECS 4 + +struct pcache_request; +struct pcache_backing_dev_req { + u8 type; + struct bio bio; + struct pcache_backing_dev *backing_dev; + + void *priv_data; + backing_req_end_fn_t end_req; + + struct list_head node; + int ret; + + union { + struct { + struct pcache_request *upper_req; + u32 bio_off; + } req; + struct { + struct bio_vec inline_bvecs[BACKING_DEV_REQ_INLINE_BVECS]; + struct bio_vec *bvecs; + u32 n_vecs; + } kmem; + }; +}; + +struct pcache_backing_dev { + struct pcache_cache *cache; + + struct dm_dev *dm_dev; + mempool_t req_pool; + mempool_t bvec_pool; + + struct list_head submit_list; + spinlock_t submit_lock; + struct work_struct req_submit_work; + + struct list_head complete_list; + spinlock_t complete_lock; + struct work_struct req_complete_work; + + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; + + u64 dev_size; +}; + +struct dm_pcache; +int backing_dev_start(struct dm_pcache *pcache); +void backing_dev_stop(struct dm_pcache 
*pcache); + +struct pcache_backing_dev_req_opts { + u32 type; + union { + struct { + struct pcache_request *upper_req; + u32 req_off; + u32 len; + } req; + struct { + void *data; + blk_opf_t opf; + u32 len; + u64 backing_off; + } kmem; + }; + + gfp_t gfp_mask; + backing_req_end_fn_t end_fn; + void *priv_data; +}; + +static inline u32 backing_dev_req_coalesced_max_len(const void *data, u32 len) +{ + const void *p = data; + u32 done = 0, in_page, to_advance; + struct page *first_page, *next_page; + + if (!is_vmalloc_addr(data)) + return len; + + first_page = vmalloc_to_page(p); +advance: + in_page = PAGE_SIZE - offset_in_page(p); + to_advance = min_t(u32, in_page, len - done); + + done += to_advance; + p += to_advance; + + if (done == len) + return done; + + next_page = vmalloc_to_page(p); + if (zone_device_pages_have_same_pgmap(first_page, next_page)) + goto advance; + + return done; +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct); +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req); +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_flush(struct pcache_backing_dev *backing_dev); + +int pcache_backing_init(void); +void pcache_backing_exit(void); +#endif /* _BACKING_DEV_H */ diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c new file mode 100644 index 000000000000..698697a7a73c --- /dev/null +++ b/drivers/md/dm-pcache/cache.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blk_types.h> + +#include "cache.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +struct kmem_cache *key_cache; + +static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache) +{ + return cache->cache_info_addr + cache->info_index; +} + +static void cache_info_write(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + cache_info->header.seq++; + cache_info->header.crc = pcache_meta_crc(&cache_info->header, + sizeof(struct pcache_cache_info)); + + memcpy_flushcache(get_cache_info_addr(cache), cache_info, + sizeof(struct pcache_cache_info)); + + cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_info_init_default(struct pcache_cache *cache); +static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_options *opts) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_info *cache_info_addr; + + cache_info_addr = pcache_meta_find_latest(&cache->cache_info_addr->header, + sizeof(struct pcache_cache_info), + PCACHE_CACHE_INFO_SIZE, + &cache->cache_info); + if (IS_ERR(cache_info_addr)) + return PTR_ERR(cache_info_addr); + + if (cache_info_addr) { + if (opts->data_crc != + (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC)) { + pcache_dev_err(pcache, "invalid option for data_crc: %s, expected: %s", + opts->data_crc ? "true" : "false", + cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC ? 
"true" : "false"); + return -EINVAL; + } + + return 0; + } + + /* init cache_info for new cache */ + cache_info_init_default(cache); + cache_mode_set(cache, opts->cache_mode); + if (opts->data_crc) + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_DATA_CRC; + + return 0; +} + +static void cache_info_set_gc_percent(struct pcache_cache_info *cache_info, u8 percent) +{ + cache_info->flags &= ~PCACHE_CACHE_FLAGS_GC_PERCENT_MASK; + cache_info->flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, percent); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent) +{ + if (percent > PCACHE_CACHE_GC_PERCENT_MAX || percent < PCACHE_CACHE_GC_PERCENT_MIN) + return -EINVAL; + + mutex_lock(&cache->cache_info_lock); + cache_info_set_gc_percent(&cache->cache_info, percent); + + cache_info_write(cache); + mutex_unlock(&cache->cache_info_lock); + + return 0; +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia_base, + struct pcache_cache_pos *pos, u64 seq, u32 *index) +{ + struct pcache_cache_pos_onmedia pos_onmedia; + struct pcache_cache_pos_onmedia *pos_onmedia_addr = pos_onmedia_base + *index; + + pos_onmedia.cache_seg_id = pos->cache_seg->cache_seg_id; + pos_onmedia.seg_off = pos->seg_off; + pos_onmedia.header.seq = seq; + pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia); + + memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia)); + pmem_wmb(); + + *index = (*index + 1) % PCACHE_META_INDEX_MAX; +} + +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index) +{ + struct pcache_cache_pos_onmedia latest, *latest_addr; + + latest_addr = pcache_meta_find_latest(&pos_onmedia->header, + sizeof(struct pcache_cache_pos_onmedia), + sizeof(struct pcache_cache_pos_onmedia), + &latest); + if (IS_ERR(latest_addr)) + return PTR_ERR(latest_addr); + + if (!latest_addr) + return -EIO; + + pos->cache_seg = &cache->segments[latest.cache_seg_id]; + pos->seg_off = latest.seg_off; + *seq = latest.header.seq; + *index = (latest_addr - pos_onmedia); + + return 0; +} + +static inline void cache_info_set_seg_id(struct pcache_cache *cache, u32 seg_id) +{ + cache->cache_info.seg_id = seg_id; +} + +static int cache_init(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + int ret; + + cache->segments = kvcalloc(cache_dev->seg_num, sizeof(struct pcache_cache_segment), GFP_KERNEL); + if (!cache->segments) { + ret = -ENOMEM; + goto err; + } + + cache->seg_map = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache->seg_map) { + ret = -ENOMEM; + goto free_segments; + } + + cache->backing_dev = backing_dev; + cache->cache_dev = &pcache->cache_dev; + cache->n_segs = cache_dev->seg_num; + atomic_set(&cache->gc_errors, 0); + spin_lock_init(&cache->seg_map_lock); + spin_lock_init(&cache->key_head_lock); + + mutex_init(&cache->cache_info_lock); + mutex_init(&cache->key_tail_lock); + mutex_init(&cache->dirty_tail_lock); + mutex_init(&cache->writeback_lock); + + INIT_DELAYED_WORK(&cache->writeback_work, cache_writeback_fn); + INIT_DELAYED_WORK(&cache->gc_work, pcache_cache_gc_fn); + INIT_WORK(&cache->clean_work, clean_fn); + + return 0; + +free_segments: + kvfree(cache->segments); +err: + return ret; +} + +static void cache_exit(struct pcache_cache 
*cache) +{ + kvfree(cache->seg_map); + kvfree(cache->segments); +} + +static void cache_info_init_default(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + memset(cache_info, 0, sizeof(*cache_info)); + cache_info->n_segs = cache->cache_dev->seg_num; + cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT); +} + +static int cache_tail_init(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + + if (new_cache) { + __set_bit(0, cache->seg_map); + + cache->key_head.cache_seg = &cache->segments[0]; + cache->key_head.seg_off = 0; + cache_pos_copy(&cache->key_tail, &cache->key_head); + cache_pos_copy(&cache->dirty_tail, &cache->key_head); + + cache_encode_dirty_tail(cache); + cache_encode_key_tail(cache); + } else { + if (cache_decode_key_tail(cache) || cache_decode_dirty_tail(cache)) { + pcache_dev_err(pcache, "Corrupted key tail or dirty tail.\n"); + return -EIO; + } + } + + return 0; +} + +static int get_seg_id(struct pcache_cache *cache, + struct pcache_cache_segment *prev_cache_seg, + bool new_cache, u32 *seg_id) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_dev *cache_dev = cache->cache_dev; + int ret; + + if (new_cache) { + ret = cache_dev_get_empty_segment_id(cache_dev, seg_id); + if (ret) { + pcache_dev_err(pcache, "no available segment\n"); + goto err; + } + + if (prev_cache_seg) + cache_seg_set_next_seg(prev_cache_seg, *seg_id); + else + cache_info_set_seg_id(cache, *seg_id); + } else { + if (prev_cache_seg) { + struct pcache_segment_info *prev_seg_info; + + prev_seg_info = &prev_cache_seg->cache_seg_info; + if (!segment_info_has_next(prev_seg_info)) { + ret = -EFAULT; + goto err; + } + *seg_id = prev_cache_seg->cache_seg_info.next_seg; + } else { + *seg_id = cache->cache_info.seg_id; + } + } + return 0; +err: + return ret; +} + +static int cache_segs_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *prev_cache_seg = NULL; + struct pcache_cache_info *cache_info = &cache->cache_info; + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + u32 seg_id; + int ret; + u32 i; + + for (i = 0; i < cache_info->n_segs; i++) { + ret = get_seg_id(cache, prev_cache_seg, new_cache, &seg_id); + if (ret) + goto err; + + ret = cache_seg_init(cache, seg_id, i, new_cache); + if (ret) + goto err; + + prev_cache_seg = &cache->segments[i]; + } + return 0; +err: + return ret; +} + +static int cache_init_req_keys(struct pcache_cache *cache, u32 n_paral) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + u32 n_subtrees; + int ret; + u32 i, cpu; + + /* Calculate number of cache trees based on the device size */ + n_subtrees = DIV_ROUND_UP(cache->dev_size << SECTOR_SHIFT, PCACHE_CACHE_SUBTREE_SIZE); + ret = cache_tree_init(cache, &cache->req_key_tree, n_subtrees); + if (ret) + goto err; + + cache->n_ksets = n_paral; + cache->ksets = kvcalloc(cache->n_ksets, PCACHE_KSET_SIZE, GFP_KERNEL); + if (!cache->ksets) { + ret = -ENOMEM; + goto req_tree_exit; + } + + /* + * Initialize each kset with a spinlock and delayed work for flushing. + * Each kset is associated with one queue to ensure independent handling + * of cache keys across multiple queues, maximizing multiqueue concurrency. 
+ */ + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + kset->cache = cache; + spin_lock_init(&kset->kset_lock); + INIT_DELAYED_WORK(&kset->flush_work, kset_flush_fn); + } + + cache->data_heads = alloc_percpu(struct pcache_cache_data_head); + if (!cache->data_heads) { + ret = -ENOMEM; + goto free_kset; + } + + for_each_possible_cpu(cpu) { + struct pcache_cache_data_head *h = + per_cpu_ptr(cache->data_heads, cpu); + h->head_pos.cache_seg = NULL; + } + + /* + * Replay persisted cache keys using cache_replay. + * This function loads and replays cache keys from previously stored + * ksets, allowing the cache to restore its state after a restart. + */ + ret = cache_replay(cache); + if (ret) { + pcache_dev_err(pcache, "failed to replay keys\n"); + goto free_heads; + } + + return 0; + +free_heads: + free_percpu(cache->data_heads); +free_kset: + kvfree(cache->ksets); +req_tree_exit: + cache_tree_exit(&cache->req_key_tree); +err: + return ret; +} + +static void cache_destroy_req_keys(struct pcache_cache *cache) +{ + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + cancel_delayed_work_sync(&kset->flush_work); + } + + free_percpu(cache->data_heads); + kvfree(cache->ksets); + cache_tree_exit(&cache->req_key_tree); +} + +int pcache_cache_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + struct pcache_cache_options *opts = &pcache->opts; + int ret; + + ret = cache_init(pcache); + if (ret) + return ret; + + cache->cache_info_addr = CACHE_DEV_CACHE_INFO(cache->cache_dev); + cache->cache_ctrl = CACHE_DEV_CACHE_CTRL(cache->cache_dev); + backing_dev->cache = cache; + cache->dev_size = backing_dev->dev_size; + + ret = cache_info_init(cache, opts); + if (ret) + goto cache_exit; + + ret = cache_segs_init(cache); + if (ret) + goto cache_exit; + + ret = cache_tail_init(cache); + if (ret) + goto cache_exit; + + ret = cache_init_req_keys(cache, num_online_cpus()); + if (ret) + goto cache_exit; + + ret = cache_writeback_init(cache); + if (ret) + goto destroy_keys; + + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_INIT_DONE; + cache_info_write(cache); + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, 0); + + return 0; + +destroy_keys: + cache_destroy_req_keys(cache); +cache_exit: + cache_exit(cache); + + return ret; +} + +void pcache_cache_stop(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + pcache_cache_flush(cache); + + cancel_delayed_work_sync(&cache->gc_work); + flush_work(&cache->clean_work); + cache_writeback_exit(cache); + + if (cache->req_key_tree.n_subtrees) + cache_destroy_req_keys(cache); + + cache_exit(cache); +} + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + return pcache->task_wq; +} + +int pcache_cache_init(void) +{ + key_cache = KMEM_CACHE(pcache_cache_key, 0); + if (!key_cache) + return -ENOMEM; + + return 0; +} + +void pcache_cache_exit(void) +{ + kmem_cache_destroy(key_cache); +} diff --git a/drivers/md/dm-pcache/cache.h b/drivers/md/dm-pcache/cache.h new file mode 100644 index 000000000000..27613b56be54 --- /dev/null +++ b/drivers/md/dm-pcache/cache.h @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_H +#define _PCACHE_CACHE_H + +#include "segment.h" + +/* Garbage collection thresholds */ +#define PCACHE_CACHE_GC_PERCENT_MIN 0 /* 
Minimum GC percentage */ +#define PCACHE_CACHE_GC_PERCENT_MAX 90 /* Maximum GC percentage */ +#define PCACHE_CACHE_GC_PERCENT_DEFAULT 70 /* Default GC percentage */ + +#define PCACHE_CACHE_SUBTREE_SIZE (4 * PCACHE_MB) /* 4MB total tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_MASK 0x3FFFFF /* Mask for tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_SHIFT 22 /* Bit shift for tree size */ + +/* Maximum number of keys per key set */ +#define PCACHE_KSET_KEYS_MAX 128 +#define PCACHE_CACHE_SEGS_MAX (1024 * 1024) /* maximum cache size for each device is 16T */ +#define PCACHE_KSET_ONMEDIA_SIZE_MAX struct_size_t(struct pcache_cache_kset_onmedia, data, PCACHE_KSET_KEYS_MAX) +#define PCACHE_KSET_SIZE (sizeof(struct pcache_cache_kset) + sizeof(struct pcache_cache_key_onmedia) * PCACHE_KSET_KEYS_MAX) + +/* Maximum number of keys to clean in one round of clean_work */ +#define PCACHE_CLEAN_KEYS_MAX 10 + +/* Writeback and garbage collection intervals in jiffies */ +#define PCACHE_CACHE_WRITEBACK_INTERVAL (5 * HZ) +#define PCACHE_CACHE_GC_INTERVAL (5 * HZ) + +/* Macro to get the cache key structure from an rb_node pointer */ +#define CACHE_KEY(node) (container_of(node, struct pcache_cache_key, rb_node)) + +struct pcache_cache_pos_onmedia { + struct pcache_meta_header header; + __u32 cache_seg_id; + __u32 seg_off; +}; + +/* Offset and size definitions for cache segment control */ +#define PCACHE_CACHE_SEG_CTRL_OFF (PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX) +#define PCACHE_CACHE_SEG_CTRL_SIZE (4 * PCACHE_KB) + +struct pcache_cache_seg_gen { + struct pcache_meta_header header; + __u64 gen; +}; + +/* Control structure for cache segments */ +struct pcache_cache_seg_ctrl { + struct pcache_cache_seg_gen gen[PCACHE_META_INDEX_MAX]; + __u64 res[64]; +}; + +#define PCACHE_CACHE_FLAGS_DATA_CRC BIT(0) +#define PCACHE_CACHE_FLAGS_INIT_DONE BIT(1) + +#define PCACHE_CACHE_FLAGS_CACHE_MODE_MASK GENMASK(5, 2) +#define PCACHE_CACHE_MODE_WRITEBACK 0 +#define PCACHE_CACHE_MODE_WRITETHROUGH 1 +#define PCACHE_CACHE_MODE_WRITEAROUND 2 +#define PCACHE_CACHE_MODE_WRITEONLY 3 + +#define PCACHE_CACHE_FLAGS_GC_PERCENT_MASK GENMASK(12, 6) + +struct pcache_cache_info { + struct pcache_meta_header header; + __u32 seg_id; + __u32 n_segs; + __u32 flags; + __u32 reserved; +}; + +struct pcache_cache_pos { + struct pcache_cache_segment *cache_seg; + u32 seg_off; +}; + +struct pcache_cache_segment { + struct pcache_cache *cache; + u32 cache_seg_id; /* Index in cache->segments */ + struct pcache_segment segment; + atomic_t refs; + + struct pcache_segment_info cache_seg_info; + struct mutex info_lock; + u32 info_index; + + spinlock_t gen_lock; + u64 gen; + u64 gen_seq; + u32 gen_index; + + struct pcache_cache_seg_ctrl *cache_seg_ctrl; +}; + +/* rbtree for cache entries */ +struct pcache_cache_subtree { + struct rb_root root; + spinlock_t tree_lock; +}; + +struct pcache_cache_tree { + struct pcache_cache *cache; + u32 n_subtrees; + mempool_t key_pool; + struct pcache_cache_subtree *subtrees; +}; + +extern struct kmem_cache *key_cache; + +struct pcache_cache_key { + struct pcache_cache_tree *cache_tree; + struct pcache_cache_subtree *cache_subtree; + struct kref ref; + struct rb_node rb_node; + struct list_head list_node; + u64 off; + u32 len; + u32 flags; + struct pcache_cache_pos cache_pos; + u64 seg_gen; +}; + +#define PCACHE_CACHE_KEY_FLAGS_EMPTY BIT(0) +#define PCACHE_CACHE_KEY_FLAGS_CLEAN BIT(1) + +struct pcache_cache_key_onmedia { + __u64 off; + __u32 len; + __u32 flags; + __u32 cache_seg_id; + __u32 cache_seg_off; + __u64 
seg_gen; + __u32 data_crc; + __u32 reserved; +}; + +struct pcache_cache_kset_onmedia { + __u32 crc; + union { + __u32 key_num; + __u32 next_cache_seg_id; + }; + __u64 magic; + __u64 flags; + struct pcache_cache_key_onmedia data[]; +}; + +struct pcache_cache { + struct pcache_backing_dev *backing_dev; + struct pcache_cache_dev *cache_dev; + struct pcache_cache_ctrl *cache_ctrl; + u64 dev_size; + + struct pcache_cache_data_head __percpu *data_heads; + + spinlock_t key_head_lock; + struct pcache_cache_pos key_head; + u32 n_ksets; + struct pcache_cache_kset *ksets; + + struct mutex key_tail_lock; + struct pcache_cache_pos key_tail; + u64 key_tail_seq; + u32 key_tail_index; + + struct mutex dirty_tail_lock; + struct pcache_cache_pos dirty_tail; + u64 dirty_tail_seq; + u32 dirty_tail_index; + + struct pcache_cache_tree req_key_tree; + struct work_struct clean_work; + + struct mutex writeback_lock; + char wb_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct pcache_cache_tree writeback_key_tree; + struct delayed_work writeback_work; + struct { + atomic_t pending; + u32 advance; + int ret; + } writeback_ctx; + + char gc_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct delayed_work gc_work; + atomic_t gc_errors; + + struct mutex cache_info_lock; + struct pcache_cache_info cache_info; + struct pcache_cache_info *cache_info_addr; + u32 info_index; + + u32 n_segs; + unsigned long *seg_map; + u32 last_cache_seg; + bool cache_full; + spinlock_t seg_map_lock; + struct pcache_cache_segment *segments; +}; + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache); + +struct dm_pcache; +struct pcache_cache_options { + u32 cache_mode:4; + u32 data_crc:1; +}; +int pcache_cache_start(struct dm_pcache *pcache); +void pcache_cache_stop(struct dm_pcache *pcache); + +struct pcache_cache_ctrl { + /* Updated by gc_thread */ + struct pcache_cache_pos_onmedia key_tail_pos[PCACHE_META_INDEX_MAX]; + + /* Updated by writeback_thread */ + struct pcache_cache_pos_onmedia dirty_tail_pos[PCACHE_META_INDEX_MAX]; +}; + +struct pcache_cache_data_head { + struct pcache_cache_pos head_pos; +}; + +static inline u16 pcache_cache_get_gc_percent(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, cache->cache_info.flags); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent); + +/* cache key */ +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask); +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key); +void cache_key_get(struct pcache_cache_key *key); +void cache_key_put(struct pcache_cache_key *key); +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close); +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup); +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key); +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len); + +#define PCACHE_KSET_FLAGS_LAST BIT(0) +#define PCACHE_KSET_MAGIC 0x676894a64e164f1aULL + +struct pcache_cache_kset { + struct pcache_cache *cache; + spinlock_t kset_lock; + struct delayed_work flush_work; + struct pcache_cache_kset_onmedia kset_onmedia; +}; + +extern struct pcache_cache_kset_onmedia pcache_empty_kset; + +#define SUBTREE_WALK_RET_OK 0 +#define SUBTREE_WALK_RET_ERR 1 +#define SUBTREE_WALK_RET_NEED_KEY 2 +#define SUBTREE_WALK_RET_NEED_REQ 3 +#define 
SUBTREE_WALK_RET_RESEARCH 4 + +struct pcache_cache_subtree_walk_ctx { + struct pcache_cache_tree *cache_tree; + struct rb_node *start_node; + struct pcache_request *pcache_req; + struct pcache_cache_key *key; + u32 req_done; + int ret; + + /* pre-allocated key and backing_dev_req */ + struct pcache_cache_key *pre_alloc_key; + struct pcache_backing_dev_req *pre_alloc_req; + + struct list_head *delete_key_list; + struct list_head *submit_req_list; + + /* + * |--------| key_tmp + * |====| key + */ + int (*before)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------| key_tmp + * |=====| key + */ + int (*after)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------------| key_tmp + * |===========| key + */ + int (*overlap_tail)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |--------| key_tmp + * |==========| key + */ + int (*overlap_head)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----| key_tmp + * |==========| key + */ + int (*overlap_contain)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |-----------| key_tmp + * |====| key + */ + int (*overlap_contained)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + int (*walk_finally)(struct pcache_cache_subtree_walk_ctx *ctx, int ret); + bool (*walk_done)(struct pcache_cache_subtree_walk_ctx *ctx); +}; + +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx); +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list); +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset); +void clean_fn(struct work_struct *work); +void kset_flush_fn(struct work_struct *work); +int cache_replay(struct pcache_cache *cache); +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees); +void cache_tree_clear(struct pcache_cache_tree *cache_tree); +void cache_tree_exit(struct pcache_cache_tree *cache_tree); + +/* cache segments */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache); +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache); +void cache_seg_get(struct pcache_cache_segment *cache_seg); +void cache_seg_put(struct pcache_cache_segment *cache_seg); +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id); + +/* cache request*/ +int pcache_cache_flush(struct pcache_cache *cache); +void miss_read_end_work_fn(struct work_struct *work); +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req); + +/* gc */ +void pcache_cache_gc_fn(struct work_struct *work); + +/* writeback */ +void cache_writeback_exit(struct pcache_cache *cache); +int cache_writeback_init(struct pcache_cache *cache); +void cache_writeback_fn(struct work_struct *work); + +/* inline functions */ +static inline struct pcache_cache_subtree *get_subtree(struct pcache_cache_tree *cache_tree, u64 off) +{ + if (cache_tree->n_subtrees == 1) + return &cache_tree->subtrees[0]; + + return &cache_tree->subtrees[off >> 
PCACHE_CACHE_SUBTREE_SIZE_SHIFT]; +} + +static inline void *cache_pos_addr(struct pcache_cache_pos *pos) +{ + return (pos->cache_seg->segment.data + pos->seg_off); +} + +static inline void *get_key_head_addr(struct pcache_cache *cache) +{ + return cache_pos_addr(&cache->key_head); +} + +static inline u32 get_kset_id(struct pcache_cache *cache, u64 off) +{ + u32 kset_id; + + div_u64_rem(off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT, cache->n_ksets, &kset_id); + + return kset_id; +} + +static inline struct pcache_cache_kset *get_kset(struct pcache_cache *cache, u32 kset_id) +{ + return (void *)cache->ksets + PCACHE_KSET_SIZE * kset_id; +} + +static inline struct pcache_cache_data_head *get_data_head(struct pcache_cache *cache) +{ + return this_cpu_ptr(cache->data_heads); +} + +static inline bool cache_key_empty(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_EMPTY; +} + +static inline bool cache_key_clean(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_CLEAN; +} + +static inline void cache_pos_copy(struct pcache_cache_pos *dst, struct pcache_cache_pos *src) +{ + memcpy(dst, src, sizeof(struct pcache_cache_pos)); +} + +/** + * cache_seg_is_ctrl_seg - Checks if a cache segment is a cache ctrl segment. + * @cache_seg_id: ID of the cache segment. + * + * Returns true if the cache segment ID corresponds to a cache ctrl segment. + * + * Note: We extend the segment control of the first cache segment + * (cache segment ID 0) to serve as the cache control (pcache_cache_ctrl) + * for the entire PCACHE cache. This function determines whether the given + * cache segment is the one storing the pcache_cache_ctrl information. + */ +static inline bool cache_seg_is_ctrl_seg(u32 cache_seg_id) +{ + return (cache_seg_id == 0); +} + +/** + * cache_key_cutfront - Cuts a specified length from the front of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the front. + * + * Advances the cache key position by cut_len and adjusts offset and length accordingly. + */ +static inline void cache_key_cutfront(struct pcache_cache_key *key, u32 cut_len) +{ + if (key->cache_pos.cache_seg) + cache_pos_advance(&key->cache_pos, cut_len); + + key->off += cut_len; + key->len -= cut_len; +} + +/** + * cache_key_cutback - Cuts a specified length from the back of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the back. + * + * Reduces the length of the cache key by cut_len. + */ +static inline void cache_key_cutback(struct pcache_cache_key *key, u32 cut_len) +{ + key->len -= cut_len; +} + +static inline void cache_key_delete(struct pcache_cache_key *key) +{ + struct pcache_cache_subtree *cache_subtree; + + cache_subtree = key->cache_subtree; + BUG_ON(!cache_subtree); + + rb_erase(&key->rb_node, &cache_subtree->root); + key->flags = 0; + cache_key_put(key); +} + +static inline bool cache_data_crc_on(struct pcache_cache *cache) +{ + return (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC); +} + +static inline u32 cache_mode_get(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache->cache_info.flags); +} + +static inline void cache_mode_set(struct pcache_cache *cache, u32 cache_mode) +{ + cache->cache_info.flags &= ~PCACHE_CACHE_FLAGS_CACHE_MODE_MASK; + cache->cache_info.flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache_mode); +} + +/** + * cache_key_data_crc - Calculates CRC for data in a cache key. 
+ * @key: Pointer to the pcache_cache_key structure. + * + * Returns the CRC-32 checksum of the data within the cache key's position. + */ +static inline u32 cache_key_data_crc(struct pcache_cache_key *key) +{ + void *data; + + data = cache_pos_addr(&key->cache_pos); + + return crc32c(PCACHE_CRC_SEED, data, key->len); +} + +static inline u32 cache_kset_crc(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + u32 crc_size; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) + crc_size = sizeof(struct pcache_cache_kset_onmedia) - 4; + else + crc_size = struct_size(kset_onmedia, data, kset_onmedia->key_num) - 4; + + return crc32c(PCACHE_CRC_SEED, (void *)kset_onmedia + 4, crc_size); +} + +static inline u32 get_kset_onmedia_size(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + return struct_size_t(struct pcache_cache_kset_onmedia, data, kset_onmedia->key_num); +} + +/** + * cache_seg_remain - Computes remaining space in a cache segment. + * @pos: Pointer to pcache_cache_pos structure. + * + * Returns the amount of remaining space in the segment data starting from + * the current position offset. + */ +static inline u32 cache_seg_remain(struct pcache_cache_pos *pos) +{ + struct pcache_cache_segment *cache_seg; + struct pcache_segment *segment; + u32 seg_remain; + + cache_seg = pos->cache_seg; + segment = &cache_seg->segment; + seg_remain = segment->data_size - pos->seg_off; + + return seg_remain; +} + +/** + * cache_key_invalid - Checks if a cache key is invalid. + * @key: Pointer to pcache_cache_key structure. + * + * Returns true if the cache key is invalid due to its generation being + * less than the generation of its segment; otherwise returns false. + * + * When the GC (garbage collection) thread identifies a segment + * as reclaimable, it increments the segment's generation (gen). However, + * it does not immediately remove all related cache keys. When accessing + * such a cache key, this function can be used to determine if the cache + * key has already become invalid. + */ +static inline bool cache_key_invalid(struct pcache_cache_key *key) +{ + if (cache_key_empty(key)) + return false; + + return (key->seg_gen < key->cache_pos.cache_seg->gen); +} + +/** + * cache_key_lstart - Retrieves the logical start offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical start offset for the cache key. + */ +static inline u64 cache_key_lstart(struct pcache_cache_key *key) +{ + return key->off; +} + +/** + * cache_key_lend - Retrieves the logical end offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical end offset for the cache key. + */ +static inline u64 cache_key_lend(struct pcache_cache_key *key) +{ + return key->off + key->len; +} + +static inline void cache_key_copy(struct pcache_cache_key *key_dst, struct pcache_cache_key *key_src) +{ + key_dst->off = key_src->off; + key_dst->len = key_src->len; + key_dst->seg_gen = key_src->seg_gen; + key_dst->cache_tree = key_src->cache_tree; + key_dst->cache_subtree = key_src->cache_subtree; + key_dst->flags = key_src->flags; + + cache_pos_copy(&key_dst->cache_pos, &key_src->cache_pos); +} + +/** + * cache_pos_onmedia_crc - Calculates the CRC for an on-media cache position. + * @pos_om: Pointer to pcache_cache_pos_onmedia structure. + * + * Calculates the CRC-32 checksum of the position, excluding the first 4 bytes. + * Returns the computed CRC value. 
+ */ +static inline u32 cache_pos_onmedia_crc(struct pcache_cache_pos_onmedia *pos_om) +{ + return pcache_meta_crc(&pos_om->header, sizeof(struct pcache_cache_pos_onmedia)); +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 seq, u32 *index); +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index); + +static inline void cache_encode_key_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, ++cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline int cache_decode_key_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, &cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline void cache_encode_dirty_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, ++cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +static inline int cache_decode_dirty_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, &cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +int pcache_cache_init(void); +void pcache_cache_exit(void); +#endif /* _PCACHE_CACHE_H */ diff --git a/drivers/md/dm-pcache/cache_dev.c b/drivers/md/dm-pcache/cache_dev.c new file mode 100644 index 000000000000..ece689e6ce59 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include <linux/dax.h> +#include <linux/vmalloc.h> +#include <linux/parser.h> + +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static void cache_dev_dax_exit(struct pcache_cache_dev *cache_dev) +{ + if (cache_dev->use_vmap) + vunmap(cache_dev->mapping); +} + +static int build_vmap(struct dax_device *dax_dev, long total_pages, void **vaddr) +{ + struct page **pages; + long i = 0, chunk; + unsigned long pfn; + int ret; + + pages = vmalloc_array(total_pages, sizeof(struct page *)); + if (!pages) + return -ENOMEM; + + do { + chunk = dax_direct_access(dax_dev, i, total_pages - i, + DAX_ACCESS, NULL, &pfn); + if (chunk <= 0) { + ret = chunk ? 
chunk : -EINVAL; + goto out_free; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto out_free; + } + + while (chunk-- && i < total_pages) { + pages[i++] = pfn_to_page(pfn); + pfn++; + if (!(i & 15)) + cond_resched(); + } + } while (i < total_pages); + + *vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL); + if (!*vaddr) { + ret = -ENOMEM; + goto out_free; + } + + ret = 0; + +out_free: + vfree(pages); + return ret; +} + +static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + struct dax_device *dax_dev; + long total_pages, mapped_pages; + u64 bdev_size; + void *vaddr; + int ret; + int id; + unsigned long pfn; + + dax_dev = cache_dev->dm_dev->dax_dev; + /* total size check */ + bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev); + if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, required at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + ret = -ENOSPC; + goto out; + } + + total_pages = bdev_size >> PAGE_SHIFT; + /* attempt: direct-map the whole range */ + id = dax_read_lock(); + mapped_pages = dax_direct_access(dax_dev, 0, total_pages, + DAX_ACCESS, &vaddr, &pfn); + if (mapped_pages < 0) { + pcache_dev_err(pcache, "dax_direct_access failed: %ld\n", mapped_pages); + ret = mapped_pages; + goto unlock; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto unlock; + } + + if (mapped_pages == total_pages) { + /* success: contiguous direct mapping */ + cache_dev->mapping = vaddr; + } else { + /* need vmap fallback */ + ret = build_vmap(dax_dev, total_pages, &vaddr); + if (ret) { + pcache_dev_err(pcache, "vmap fallback failed: %d\n", ret); + goto unlock; + } + + cache_dev->mapping = vaddr; + cache_dev->use_vmap = true; + } + dax_read_unlock(id); + + return 0; +unlock: + dax_read_unlock(id); +out: + return ret; +} + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size) +{ + memset(pos, 0, size); + dax_flush(cache_dev->dm_dev->dax_dev, pos, size); +} + +static int sb_read(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + if (copy_mc_to_kernel(sb, sb_addr, sizeof(struct pcache_sb))) + return -EIO; + + return 0; +} + +static void sb_write(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + memcpy_flushcache(sb_addr, sb, sizeof(struct pcache_sb)); + pmem_wmb(); +} + +static int sb_init(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u64 nr_segs; + u64 cache_dev_size; + u64 magic; + u32 flags = 0; + + magic = le64_to_cpu(sb->magic); + if (magic) + return -EEXIST; + + cache_dev_size = bdev_nr_bytes(file_bdev(cache_dev->dm_dev->bdev_file)); + if (cache_dev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, required at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + return -ENOSPC; + } + + nr_segs = (cache_dev_size - PCACHE_SEGMENTS_OFF) / ((PCACHE_SEG_SIZE)); + +#if defined(__BYTE_ORDER) ? 
(__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + flags |= PCACHE_SB_F_BIGENDIAN; +#endif + sb->flags = cpu_to_le32(flags); + sb->magic = cpu_to_le64(PCACHE_MAGIC); + sb->seg_num = cpu_to_le32(nr_segs); + sb->crc = cpu_to_le32(crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4)); + + cache_dev_zero_range(cache_dev, CACHE_DEV_CACHE_INFO(cache_dev), + PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_CTRL_SIZE); + + return 0; +} + +static int sb_validate(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 flags; + u32 crc; + + if (le64_to_cpu(sb->magic) != PCACHE_MAGIC) { + pcache_dev_err(pcache, "unexpected magic: %llx\n", + le64_to_cpu(sb->magic)); + return -EINVAL; + } + + crc = crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4); + if (crc != le32_to_cpu(sb->crc)) { + pcache_dev_err(pcache, "corrupted sb: %u, expected: %u\n", crc, le32_to_cpu(sb->crc)); + return -EINVAL; + } + + flags = le32_to_cpu(sb->flags); +#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + if (!(flags & PCACHE_SB_F_BIGENDIAN)) { + pcache_dev_err(pcache, "cache_dev is not big endian\n"); + return -EINVAL; + } +#else + if (flags & PCACHE_SB_F_BIGENDIAN) { + pcache_dev_err(pcache, "cache_dev is big endian\n"); + return -EINVAL; + } +#endif + return 0; +} + +static int cache_dev_init(struct pcache_cache_dev *cache_dev, u32 seg_num) +{ + cache_dev->seg_num = seg_num; + cache_dev->seg_bitmap = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache_dev->seg_bitmap) + return -ENOMEM; + + return 0; +} + +static void cache_dev_exit(struct pcache_cache_dev *cache_dev) +{ + kvfree(cache_dev->seg_bitmap); +} + +void cache_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + + cache_dev_exit(cache_dev); + cache_dev_dax_exit(cache_dev); +} + +int cache_dev_start(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_sb sb; + bool format = false; + int ret; + + mutex_init(&cache_dev->seg_lock); + + ret = cache_dev_dax_init(cache_dev); + if (ret) { + pcache_dev_err(pcache, "failed to init cache_dev %s via dax way: %d.", + cache_dev->dm_dev->name, ret); + goto err; + } + + ret = sb_read(cache_dev, &sb); + if (ret) + goto dax_release; + + if (le64_to_cpu(sb.magic) == 0) { + format = true; + ret = sb_init(cache_dev, &sb); + if (ret < 0) + goto dax_release; + } + + ret = sb_validate(cache_dev, &sb); + if (ret) + goto dax_release; + + cache_dev->sb_flags = le32_to_cpu(sb.flags); + ret = cache_dev_init(cache_dev, le32_to_cpu(sb.seg_num)); + if (ret) + goto dax_release; + + if (format) + sb_write(cache_dev, &sb); + + return 0; + +dax_release: + cache_dev_dax_exit(cache_dev); +err: + return ret; +} + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id) +{ + int ret; + + mutex_lock(&cache_dev->seg_lock); + *seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, 0); + if (*seg_id == cache_dev->seg_num) { + ret = -ENOSPC; + goto unlock; + } + + __set_bit(*seg_id, cache_dev->seg_bitmap); + ret = 0; +unlock: + mutex_unlock(&cache_dev->seg_lock); + return ret; +} diff --git a/drivers/md/dm-pcache/cache_dev.h b/drivers/md/dm-pcache/cache_dev.h new file mode 100644 index 000000000000..6251eb4ebe96 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: 
GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_DEV_H +#define _PCACHE_CACHE_DEV_H + +#include <linux/device.h> +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +#define PCACHE_MAGIC 0x65B05EFA96C596EFULL + +#define PCACHE_SB_OFF (4 * PCACHE_KB) +#define PCACHE_SB_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_INFO_OFF (PCACHE_SB_OFF + PCACHE_SB_SIZE) +#define PCACHE_CACHE_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_CTRL_OFF (PCACHE_CACHE_INFO_OFF + (PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX)) +#define PCACHE_CACHE_CTRL_SIZE (4 * PCACHE_KB) + +#define PCACHE_SEGMENTS_OFF (PCACHE_CACHE_CTRL_OFF + PCACHE_CACHE_CTRL_SIZE) +#define PCACHE_SEG_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_DEV_SIZE_MIN (512 * PCACHE_MB) /* 512 MB */ +#define PCACHE_SEG_SIZE (16 * PCACHE_MB) /* Size of each PCACHE segment (16 MB) */ + +#define CACHE_DEV_SB(cache_dev) ((struct pcache_sb *)(cache_dev->mapping + PCACHE_SB_OFF)) +#define CACHE_DEV_CACHE_INFO(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_INFO_OFF) +#define CACHE_DEV_CACHE_CTRL(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_CTRL_OFF) +#define CACHE_DEV_SEGMENTS(cache_dev) ((void *)cache_dev->mapping + PCACHE_SEGMENTS_OFF) +#define CACHE_DEV_SEGMENT(cache_dev, id) ((void *)CACHE_DEV_SEGMENTS(cache_dev) + (u64)id * PCACHE_SEG_SIZE) + +/* + * PCACHE SB flags configured during formatting + * + * The PCACHE_SB_F_xxx flags define registration requirements based on cache_dev + * formatting. For a machine to register a cache_dev: + * - PCACHE_SB_F_BIGENDIAN: Requires a big-endian machine. + */ +#define PCACHE_SB_F_BIGENDIAN BIT(0) + +struct pcache_sb { + __le32 crc; + __le32 flags; + __le64 magic; + + __le32 seg_num; +}; + +struct pcache_cache_dev { + u32 sb_flags; + u32 seg_num; + void *mapping; + bool use_vmap; + + struct dm_dev *dm_dev; + + struct mutex seg_lock; + unsigned long *seg_bitmap; +}; + +struct dm_pcache; +int cache_dev_start(struct dm_pcache *pcache); +void cache_dev_stop(struct dm_pcache *pcache); + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size); + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id); + +#endif /* _PCACHE_CACHE_DEV_H */ diff --git a/drivers/md/dm-pcache/cache_gc.c b/drivers/md/dm-pcache/cache_gc.c new file mode 100644 index 000000000000..94f8b276a021 --- /dev/null +++ b/drivers/md/dm-pcache/cache_gc.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +/** + * cache_key_gc - Releases the reference of a cache key segment. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key to be garbage collected. + * + * This function decrements the reference count of the cache segment + * associated with the given key. If the reference count drops to zero, + * the segment may be invalidated and reused. 
+ */ +static void cache_key_gc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + cache_seg_put(key->cache_pos.cache_seg); +} + +static bool need_gc(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail, struct pcache_cache_pos *key_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + void *dirty_addr, *key_addr; + u32 segs_used, segs_gc_threshold, to_copy; + int ret; + + dirty_addr = cache_pos_addr(dirty_tail); + key_addr = cache_pos_addr(key_tail); + if (dirty_addr == key_addr) { + pcache_dev_debug(pcache, "key tail is equal to dirty tail: %u:%u\n", + dirty_tail->cache_seg->cache_seg_id, + dirty_tail->seg_off); + return false; + } + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - key_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, key_addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "error to read kset: %d", ret); + return false; + } + + /* Check if kset_onmedia is corrupted */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "gc error: magic is not as expected. key_tail: %u:%u magic: %llx, expected: %llx\n", + key_tail->cache_seg->cache_seg_id, key_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return false; + } + + /* Verify the CRC of the kset_onmedia */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "gc error: crc is not as expected. crc: %x, expected: %x\n", + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return false; + } + + segs_used = bitmap_weight(cache->seg_map, cache->n_segs); + segs_gc_threshold = cache->n_segs * pcache_cache_get_gc_percent(cache) / 100; + if (segs_used < segs_gc_threshold) { + pcache_dev_debug(pcache, "segs_used: %u, segs_gc_threshold: %u\n", segs_used, segs_gc_threshold); + return false; + } + + return true; +} + +/** + * last_kset_gc - Advances the garbage collection for the last kset. + * @cache: Pointer to the pcache_cache structure. + * @kset_onmedia: Pointer to the kset_onmedia structure for the last kset. 
+ */ +static void last_kset_gc(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *cur_seg, *next_seg; + + cur_seg = cache->key_tail.cache_seg; + + next_seg = &cache->segments[kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->key_tail_lock); + cache->key_tail.cache_seg = next_seg; + cache->key_tail.seg_off = 0; + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + + pcache_dev_debug(pcache, "gc advance kset seg: %u\n", cur_seg->cache_seg_id); + + spin_lock(&cache->seg_map_lock); + __clear_bit(cur_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); +} + +void pcache_cache_gc_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, gc_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail, key_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_key *key; + int ret; + int i; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + while (true) { + if (pcache_is_stopping(pcache) || atomic_read(&cache->gc_errors)) + return; + + /* Get new tail positions */ + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + mutex_lock(&cache->key_tail_lock); + cache_pos_copy(&key_tail, &cache->key_tail); + mutex_unlock(&cache->key_tail_lock); + + if (!need_gc(cache, &dirty_tail, &key_tail)) + break; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + /* Don't move to the next segment if dirty_tail has not moved */ + if (dirty_tail.cache_seg == key_tail.cache_seg) + break; + + last_kset_gc(cache, kset_onmedia); + continue; + } + + for (i = 0; i < kset_onmedia->key_num; i++) { + struct pcache_cache_key key_tmp = { 0 }; + + key_onmedia = &kset_onmedia->data[i]; + + key = &key_tmp; + cache_key_init(&cache->req_key_tree, key); + + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + /* return without re-arm gc work, and prevent future + * gc, because we can't retry the partial-gc-ed kset + */ + atomic_inc(&cache->gc_errors); + pcache_dev_err(pcache, "failed to decode cache key in gc\n"); + return; + } + + cache_key_gc(cache, key); + } + + pcache_dev_debug(pcache, "gc advance: %u:%u %u\n", + key_tail.cache_seg->cache_seg_id, + key_tail.seg_off, + get_kset_onmedia_size(kset_onmedia)); + + mutex_lock(&cache->key_tail_lock); + cache_pos_advance(&cache->key_tail, get_kset_onmedia_size(kset_onmedia)); + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + } + + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, PCACHE_CACHE_GC_INTERVAL); +} diff --git a/drivers/md/dm-pcache/cache_key.c b/drivers/md/dm-pcache/cache_key.c new file mode 100644 index 000000000000..2b77e121f89b --- /dev/null +++ b/drivers/md/dm-pcache/cache_key.c @@ -0,0 +1,888 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +struct pcache_cache_kset_onmedia pcache_empty_kset = { 0 }; + +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key) +{ + kref_init(&key->ref); + key->cache_tree = cache_tree; + INIT_LIST_HEAD(&key->list_node); + RB_CLEAR_NODE(&key->rb_node); +} + +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t 
gfp_mask) +{ + struct pcache_cache_key *key; + + key = mempool_alloc(&cache_tree->key_pool, gfp_mask); + if (!key) + return NULL; + + memset(key, 0, sizeof(struct pcache_cache_key)); + cache_key_init(cache_tree, key); + + return key; +} + +/** + * cache_key_get - Increment the reference count of a cache key. + * @key: Pointer to the pcache_cache_key structure. + * + * This function increments the reference count of the specified cache key, + * ensuring that it is not freed while still in use. + */ +void cache_key_get(struct pcache_cache_key *key) +{ + kref_get(&key->ref); +} + +/** + * cache_key_destroy - Free a cache key structure when its reference count drops to zero. + * @ref: Pointer to the kref structure. + * + * This function is called when the reference count of the cache key reaches zero. + * It frees the allocated cache key back to the slab cache. + */ +static void cache_key_destroy(struct kref *ref) +{ + struct pcache_cache_key *key = container_of(ref, struct pcache_cache_key, ref); + struct pcache_cache_tree *cache_tree = key->cache_tree; + + mempool_free(key, &cache_tree->key_pool); +} + +void cache_key_put(struct pcache_cache_key *key) +{ + kref_put(&key->ref, cache_key_destroy); +} + +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len) +{ + /* Ensure enough space remains in the current segment */ + BUG_ON(cache_seg_remain(pos) < len); + + pos->seg_off += len; +} + +static void cache_key_encode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + key_onmedia->off = key->off; + key_onmedia->len = key->len; + + key_onmedia->cache_seg_id = key->cache_pos.cache_seg->cache_seg_id; + key_onmedia->cache_seg_off = key->cache_pos.seg_off; + + key_onmedia->seg_gen = key->seg_gen; + key_onmedia->flags = key->flags; + + if (cache_data_crc_on(cache)) + key_onmedia->data_crc = cache_key_data_crc(key); +} + +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + key->off = key_onmedia->off; + key->len = key_onmedia->len; + + key->cache_pos.cache_seg = &cache->segments[key_onmedia->cache_seg_id]; + key->cache_pos.seg_off = key_onmedia->cache_seg_off; + + key->seg_gen = key_onmedia->seg_gen; + key->flags = key_onmedia->flags; + + if (cache_data_crc_on(cache) && + key_onmedia->data_crc != cache_key_data_crc(key)) { + pcache_dev_err(pcache, "key: %llu:%u seg %u:%u data_crc error: %x, expected: %x\n", + key->off, key->len, key->cache_pos.cache_seg->cache_seg_id, + key->cache_pos.seg_off, cache_key_data_crc(key), key_onmedia->data_crc); + return -EIO; + } + + return 0; +} + +static void append_last_kset(struct pcache_cache *cache, u32 next_seg) +{ + struct pcache_cache_kset_onmedia kset_onmedia = { 0 }; + + kset_onmedia.flags |= PCACHE_KSET_FLAGS_LAST; + kset_onmedia.next_cache_seg_id = next_seg; + kset_onmedia.magic = PCACHE_KSET_MAGIC; + kset_onmedia.crc = cache_kset_crc(&kset_onmedia); + + memcpy_flushcache(get_key_head_addr(cache), &kset_onmedia, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + cache_pos_advance(&cache->key_head, sizeof(struct pcache_cache_kset_onmedia)); +} + +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset) +{ + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 kset_onmedia_size; + int ret; + + kset_onmedia = &kset->kset_onmedia; + + if (!kset_onmedia->key_num) + return 0; + + kset_onmedia_size = struct_size(kset_onmedia, data, 
kset_onmedia->key_num); + + spin_lock(&cache->key_head_lock); +again: + /* Reserve space for the last kset */ + if (cache_seg_remain(&cache->key_head) < kset_onmedia_size + sizeof(struct pcache_cache_kset_onmedia)) { + struct pcache_cache_segment *next_seg; + + next_seg = get_cache_segment(cache); + if (!next_seg) { + ret = -EBUSY; + goto out; + } + + /* clear outdated kset in next seg */ + memcpy_flushcache(next_seg->segment.data, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + append_last_kset(cache, next_seg->cache_seg_id); + cache->key_head.cache_seg = next_seg; + cache->key_head.seg_off = 0; + goto again; + } + + kset_onmedia->magic = PCACHE_KSET_MAGIC; + kset_onmedia->crc = cache_kset_crc(kset_onmedia); + + /* clear outdated kset after current kset */ + memcpy_flushcache(get_key_head_addr(cache) + kset_onmedia_size, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + /* write current kset into segment */ + memcpy_flushcache(get_key_head_addr(cache), kset_onmedia, kset_onmedia_size); + pmem_wmb(); + + /* reset kset_onmedia */ + memset(kset_onmedia, 0, sizeof(struct pcache_cache_kset_onmedia)); + cache_pos_advance(&cache->key_head, kset_onmedia_size); + + ret = 0; +out: + spin_unlock(&cache->key_head_lock); + + return ret; +} + +/** + * cache_key_append - Append a cache key to the related kset. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key structure to append. + * @force_close: Need to close current kset if true. + * + * This function appends a cache key to the appropriate kset. If the kset + * is full, it closes the kset. If not, it queues a flush work to write + * the kset to media. + * + * Returns 0 on success, or a negative error code on failure. + */ +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close) +{ + struct pcache_cache_kset *kset; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + u32 kset_id = get_kset_id(cache, key->off); + int ret = 0; + + kset = get_kset(cache, kset_id); + kset_onmedia = &kset->kset_onmedia; + + spin_lock(&kset->kset_lock); + key_onmedia = &kset_onmedia->data[kset_onmedia->key_num]; + cache_key_encode(cache, key_onmedia, key); + + /* Check if the current kset has reached the maximum number of keys */ + if (++kset_onmedia->key_num == PCACHE_KSET_KEYS_MAX || force_close) { + /* If full, close the kset */ + ret = cache_kset_close(cache, kset); + if (ret) { + kset_onmedia->key_num--; + goto out; + } + } else { + /* If not full, queue a delayed work to flush the kset */ + queue_delayed_work(cache_get_wq(cache), &kset->flush_work, 1 * HZ); + } +out: + spin_unlock(&kset->kset_lock); + + return ret; +} + +/** + * cache_subtree_walk - Traverse the cache tree. + * @ctx: Pointer to the context structure for traversal. + * + * This function traverses the cache tree starting from the specified node. + * It calls the appropriate callback functions based on the relationships + * between the keys in the cache tree. + * + * Returns 0 on success, or a negative error code on failure. + */ +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key_tmp, *key; + struct rb_node *node_tmp; + int ret = SUBTREE_WALK_RET_OK; + + key = ctx->key; + node_tmp = ctx->start_node; + + while (node_tmp) { + if (ctx->walk_done && ctx->walk_done(ctx)) + break; + + key_tmp = CACHE_KEY(node_tmp); + /* + * If key_tmp ends before the start of key, continue to the next node. 
+ * |----------| + * |=====| + */ + if (cache_key_lend(key_tmp) <= cache_key_lstart(key)) { + if (ctx->after) { + ret = ctx->after(key, key_tmp, ctx); + if (ret) + goto out; + } + goto next; + } + + /* + * If key_tmp starts after the end of key, stop traversing. + * |--------| + * |====| + */ + if (cache_key_lstart(key_tmp) >= cache_key_lend(key)) { + if (ctx->before) { + ret = ctx->before(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* Handle overlapping keys */ + if (cache_key_lstart(key_tmp) >= cache_key_lstart(key)) { + /* + * If key_tmp encompasses key. + * |----------------| key_tmp + * |===========| key + */ + if (cache_key_lend(key_tmp) >= cache_key_lend(key)) { + if (ctx->overlap_tail) { + ret = ctx->overlap_tail(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp is contained within key. + * |----| key_tmp + * |==========| key + */ + if (ctx->overlap_contain) { + ret = ctx->overlap_contain(key, key_tmp, ctx); + if (ret) + goto out; + } + + goto next; + } + + /* + * If key_tmp starts before key ends but ends after key. + * |-----------| key_tmp + * |====| key + */ + if (cache_key_lend(key_tmp) > cache_key_lend(key)) { + if (ctx->overlap_contained) { + ret = ctx->overlap_contained(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp starts before key and ends within key. + * |--------| key_tmp + * |==========| key + */ + if (ctx->overlap_head) { + ret = ctx->overlap_head(key, key_tmp, ctx); + if (ret) + goto out; + } +next: + node_tmp = rb_next(node_tmp); + } + +out: + if (ctx->walk_finally) + ret = ctx->walk_finally(ctx, ret); + + return ret; +} + +/** + * cache_subtree_search - Search for a key in the cache tree. + * @cache_subtree: Pointer to the cache tree structure. + * @key: Pointer to the cache key to search for. + * @parentp: Pointer to store the parent node of the found node. + * @newp: Pointer to store the location where the new node should be inserted. + * @delete_key_list: List to collect invalid keys for deletion. + * + * This function searches the cache tree for a specific key and returns + * the node that is the predecessor of the key, or first node if the key is + * less than all keys in the tree. If any invalid keys are found during + * the search, they are added to the delete_key_list for later cleanup. + * + * Returns a pointer to the previous node. 
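+ *
+ * The subtree is ordered by key->off.  While walking down, any key for which
+ * cache_key_invalid() is true is queued on @delete_key_list; callers delete
+ * those keys and restart the search.  @parentp and @newp are optional and,
+ * when supplied, receive the rb_link parent and link location so the caller
+ * can rb_link_node() the new key without a second lookup.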
+ */ +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list) +{ + struct rb_node **new, *parent = NULL; + struct pcache_cache_key *key_tmp; + struct rb_node *prev_node = NULL; + + new = &(cache_subtree->root.rb_node); + while (*new) { + key_tmp = container_of(*new, struct pcache_cache_key, rb_node); + if (cache_key_invalid(key_tmp)) + list_add(&key_tmp->list_node, delete_key_list); + + parent = *new; + if (key_tmp->off >= key->off) { + new = &((*new)->rb_left); + } else { + prev_node = *new; + new = &((*new)->rb_right); + } + } + + if (!prev_node) + prev_node = rb_first(&cache_subtree->root); + + if (parentp) + *parentp = parent; + + if (newp) + *newp = new; + + return prev_node; +} + +static struct pcache_cache_key *get_pre_alloc_key(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key; + + if (ctx->pre_alloc_key) { + key = ctx->pre_alloc_key; + ctx->pre_alloc_key = NULL; + + return key; + } + + return cache_key_alloc(ctx->cache_tree, GFP_NOWAIT); +} + +/** + * fixup_overlap_tail - Adjust the key when it overlaps at the tail. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that overlaps. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function modifies the existing key (key_tmp) when there is an + * overlap at the tail with the new key. If the modified key becomes + * empty, it is deleted. + */ +static int fixup_overlap_tail(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----------------| key_tmp + * |===========| key + */ + BUG_ON(cache_key_empty(key)); + if (cache_key_empty(key_tmp)) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + cache_key_cutfront(key_tmp, cache_key_lend(key) - cache_key_lstart(key_tmp)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * fixup_overlap_contain - Handle case where new key completely contains an existing key. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that is being contained. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function deletes the existing key (key_tmp) when the new key + * completely contains it. It returns SUBTREE_WALK_RET_RESEARCH to indicate that the + * tree structure may have changed, necessitating a re-insertion of + * the new key. + */ +static int fixup_overlap_contain(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----| key_tmp + * |==========| key + */ + BUG_ON(cache_key_empty(key)); + cache_key_delete(key_tmp); + + return SUBTREE_WALK_RET_RESEARCH; +} + +/** + * fixup_overlap_contained - Handle overlap when a new key is contained in an existing key. + * @key: The new cache key being inserted. + * @key_tmp: The existing cache key that overlaps with the new key. + * @ctx: Context for the cache tree walk. + * + * This function adjusts the existing key if the new key is contained + * within it. If the existing key is empty, it indicates a placeholder key + * that was inserted during a miss read. This placeholder will later be + * updated with real data from the backing_dev, making it no longer an empty key. 
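+ *
+ * If key_tmp holds real data it is split: key_tmp keeps the part in front of
+ * key, and a new key (taken from ctx->pre_alloc_key when available, otherwise
+ * allocated with GFP_NOWAIT) is inserted to cover the part behind key.  When
+ * that allocation fails, SUBTREE_WALK_RET_NEED_KEY tells the caller to drop
+ * the tree lock, allocate a key with GFP_NOIO and research the tree.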
+ * + * If we delete key or insert a key, the structure of the entire cache tree may change, + * requiring a full research of the tree to find a new insertion point. + */ +static int fixup_overlap_contained(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_tree *cache_tree = ctx->cache_tree; + + /* + * |-----------| key_tmp + * |====| key + */ + BUG_ON(cache_key_empty(key)); + if (cache_key_empty(key_tmp)) { + /* If key_tmp is empty, don't split it; + * it's a placeholder key for miss reads that will be updated later. + */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + } else { + struct pcache_cache_key *key_fixup; + bool need_research = false; + + key_fixup = get_pre_alloc_key(ctx); + if (!key_fixup) + return SUBTREE_WALK_RET_NEED_KEY; + + cache_key_copy(key_fixup, key_tmp); + + /* Split key_tmp based on the new key's range */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + need_research = true; + } + + /* Create a new portion for key_fixup */ + cache_key_cutfront(key_fixup, cache_key_lend(key) - cache_key_lstart(key_tmp)); + if (key_fixup->len == 0) { + cache_key_put(key_fixup); + } else { + /* Insert the new key into the cache */ + cache_key_insert(cache_tree, key_fixup, false); + need_research = true; + } + + if (need_research) + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * fixup_overlap_head - Handle overlap when a new key overlaps with the head of an existing key. + * @key: The new cache key being inserted. + * @key_tmp: The existing cache key that overlaps with the new key. + * @ctx: Context for the cache tree walk. + * + * This function adjusts the existing key if the new key overlaps + * with the beginning of it. If the resulting key length is zero + * after the adjustment, the key is deleted. This indicates that + * the key no longer holds valid data and requires the tree to be + * re-researched for a new insertion point. + */ +static int fixup_overlap_head(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |--------| key_tmp + * |==========| key + */ + BUG_ON(cache_key_empty(key)); + /* Adjust key_tmp by cutting back based on the new key's start */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + /* If the adjusted key_tmp length is zero, delete it */ + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * cache_key_insert - Insert a new cache key into the cache tree. + * @cache_tree: Pointer to the cache_tree structure. + * @key: The cache key to insert. + * @fixup: Indicates if this is a new key being inserted. + * + * This function searches for the appropriate location to insert + * a new cache key into the cache tree. It handles key overlaps + * and ensures any invalid keys are removed before insertion. 
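+ *
+ * The caller must hold the tree_lock of the subtree covering key->off.  With
+ * @fixup set, cache_subtree_walk() trims, splits or deletes existing keys that
+ * overlap the new key before it is linked into the rbtree.  Typical usage, as
+ * in cache_write() and kset_replay():
+ *
+ *	spin_lock(&cache_subtree->tree_lock);
+ *	cache_key_insert(&cache->req_key_tree, key, true);
+ *	spin_unlock(&cache_subtree->tree_lock);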
+ */ +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup) +{ + struct pcache_cache *cache = cache_tree->cache; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct rb_node **new, *parent = NULL; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + LIST_HEAD(delete_key_list); + int ret; + + cache_subtree = get_subtree(cache_tree, key->off); + key->cache_subtree = cache_subtree; +search: + prev_node = cache_subtree_search(cache_subtree, key, &parent, &new, &delete_key_list); + if (!list_empty(&delete_key_list)) { + /* Remove invalid keys from the delete list */ + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + if (fixup) { + /* Set up the context with the cache, start node, and new key */ + walk_ctx.cache_tree = cache_tree; + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + /* Assign overlap handling functions for different scenarios */ + walk_ctx.overlap_tail = fixup_overlap_tail; + walk_ctx.overlap_head = fixup_overlap_head; + walk_ctx.overlap_contain = fixup_overlap_contain; + walk_ctx.overlap_contained = fixup_overlap_contained; + + ret = cache_subtree_walk(&walk_ctx); + switch (ret) { + case SUBTREE_WALK_RET_OK: + break; + case SUBTREE_WALK_RET_RESEARCH: + goto search; + case SUBTREE_WALK_RET_NEED_KEY: + spin_unlock(&cache_subtree->tree_lock); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_key with GFP_NOIO"); + walk_ctx.pre_alloc_key = cache_key_alloc(cache_tree, GFP_NOIO); + spin_lock(&cache_subtree->tree_lock); + goto search; + default: + BUG(); + } + } + + if (walk_ctx.pre_alloc_key) + cache_key_put(walk_ctx.pre_alloc_key); + + /* Link and insert the new key into the red-black tree */ + rb_link_node(&key->rb_node, parent, new); + rb_insert_color(&key->rb_node, &cache_subtree->root); +} + +/** + * clean_fn - Cleanup function to remove invalid keys from the cache tree. + * @work: Pointer to the work_struct associated with the cleanup. + * + * This function cleans up invalid keys from the cache tree in the background + * after a cache segment has been invalidated during cache garbage collection. + * It processes a maximum of PCACHE_CLEAN_KEYS_MAX keys per iteration and holds + * the tree lock to ensure thread safety. + */ +void clean_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, clean_work); + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + int i, count; + + for (i = 0; i < cache->req_key_tree.n_subtrees; i++) { + cache_subtree = &cache->req_key_tree.subtrees[i]; + +again: + if (pcache_is_stopping(CACHE_TO_PCACHE(cache))) + return; + + /* Delete up to PCACHE_CLEAN_KEYS_MAX keys in one iteration */ + count = 0; + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + if (cache_key_invalid(key)) { + count++; + cache_key_delete(key); + } + + if (count >= PCACHE_CLEAN_KEYS_MAX) { + /* Unlock and pause before continuing cleanup */ + spin_unlock(&cache_subtree->tree_lock); + usleep_range(1000, 2000); + goto again; + } + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +/* + * kset_flush_fn - Flush work for a cache kset. 
+ * + * This function is called when a kset flush work is queued from + * cache_key_append(). If the kset is full, it will be closed + * immediately. If not, the flush work will be queued for later closure. + * + * If cache_kset_close detects that a new segment is required to store + * the kset and there are no available segments, it will return an error. + * In this scenario, a retry will be attempted. + */ +void kset_flush_fn(struct work_struct *work) +{ + struct pcache_cache_kset *kset = container_of(work, struct pcache_cache_kset, flush_work.work); + struct pcache_cache *cache = kset->cache; + int ret; + + if (pcache_is_stopping(CACHE_TO_PCACHE(cache))) + return; + + spin_lock(&kset->kset_lock); + ret = cache_kset_close(cache, kset); + spin_unlock(&kset->kset_lock); + + if (ret) { + /* Failed to flush kset, schedule a retry. */ + queue_delayed_work(cache_get_wq(cache), &kset->flush_work, msecs_to_jiffies(100)); + } +} + +static int kset_replay(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + int ret; + int i; + + for (i = 0; i < kset_onmedia->key_num; i++) { + key_onmedia = &kset_onmedia->data[i]; + + key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO); + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + cache_key_put(key); + goto err; + } + + __set_bit(key->cache_pos.cache_seg->cache_seg_id, cache->seg_map); + + /* Check if the segment generation is valid for insertion. */ + if (key->seg_gen < key->cache_pos.cache_seg->gen) { + cache_key_put(key); + } else { + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->req_key_tree, key, true); + spin_unlock(&cache_subtree->tree_lock); + } + + cache_seg_get(key->cache_pos.cache_seg); + } + + return 0; +err: + return ret; +} + +int cache_replay(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos pos_tail; + struct pcache_cache_pos *pos; + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 to_copy, count = 0; + int ret = 0; + + kset_onmedia = kzalloc(PCACHE_KSET_ONMEDIA_SIZE_MAX, GFP_KERNEL); + if (!kset_onmedia) + return -ENOMEM; + + cache_pos_copy(&pos_tail, &cache->key_tail); + pos = &pos_tail; + + /* + * In cache replaying stage, there is no other one will access + * cache->seg_map, so we can set bit here without cache->seg_map_lock. + */ + __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map); + + while (true) { + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - pos->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, cache_pos_addr(pos), to_copy); + if (ret) { + ret = -EIO; + goto out; + } + + if (kset_onmedia->magic != PCACHE_KSET_MAGIC || + kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + break; + } + + /* Process the last kset and prepare for the next segment. */ + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + struct pcache_cache_segment *next_seg; + + pcache_dev_debug(pcache, "last kset replay, next: %u\n", kset_onmedia->next_cache_seg_id); + + next_seg = &cache->segments[kset_onmedia->next_cache_seg_id]; + + pos->cache_seg = next_seg; + pos->seg_off = 0; + + __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map); + continue; + } + + /* Replay the kset and check for errors. */ + ret = kset_replay(cache, kset_onmedia); + if (ret) + goto out; + + /* Advance the position after processing the kset. 
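+ * Replay also calls cond_resched() roughly every 512 replayed ksets so that
+ * loading a large cache does not hog the CPU.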
*/ + cache_pos_advance(pos, get_kset_onmedia_size(kset_onmedia)); + if (++count > 512) { + cond_resched(); + count = 0; + } + } + + /* Update the key_head position after replaying. */ + spin_lock(&cache->key_head_lock); + cache_pos_copy(&cache->key_head, pos); + spin_unlock(&cache->key_head_lock); +out: + kfree(kset_onmedia); + return ret; +} + +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees) +{ + int ret; + u32 i; + + cache_tree->cache = cache; + cache_tree->n_subtrees = n_subtrees; + + ret = mempool_init_slab_pool(&cache_tree->key_pool, 1024, key_cache); + if (ret) + goto err; + + /* + * Allocate and initialize the subtrees array. + * Each element is a cache tree structure that contains + * an RB tree root and a spinlock for protecting its contents. + */ + cache_tree->subtrees = kvcalloc(cache_tree->n_subtrees, sizeof(struct pcache_cache_subtree), GFP_KERNEL); + if (!cache_tree->subtrees) { + ret = -ENOMEM; + goto key_pool_exit; + } + + for (i = 0; i < cache_tree->n_subtrees; i++) { + struct pcache_cache_subtree *cache_subtree = &cache_tree->subtrees[i]; + + cache_subtree->root = RB_ROOT; + spin_lock_init(&cache_subtree->tree_lock); + } + + return 0; + +key_pool_exit: + mempool_exit(&cache_tree->key_pool); +err: + return ret; +} + +void cache_tree_clear(struct pcache_cache_tree *cache_tree) +{ + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_delete(key); + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +void cache_tree_exit(struct pcache_cache_tree *cache_tree) +{ + cache_tree_clear(cache_tree); + kvfree(cache_tree->subtrees); + mempool_exit(&cache_tree->key_pool); +} diff --git a/drivers/md/dm-pcache/cache_req.c b/drivers/md/dm-pcache/cache_req.c new file mode 100644 index 000000000000..7854a30e07b7 --- /dev/null +++ b/drivers/md/dm-pcache/cache_req.c @@ -0,0 +1,836 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static int cache_data_head_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *next_seg; + struct pcache_cache_data_head *data_head; + + data_head = get_data_head(cache); + next_seg = get_cache_segment(cache); + if (!next_seg) + return -EBUSY; + + cache_seg_get(next_seg); + data_head->head_pos.cache_seg = next_seg; + data_head->head_pos.seg_off = 0; + + return 0; +} + +/** + * cache_data_alloc - Allocate data for a cache key. + * @cache: Pointer to the cache structure. + * @key: Pointer to the cache key to allocate data for. + * + * This function tries to allocate space from the cache segment specified by the + * data head. If the remaining space in the segment is insufficient to allocate + * the requested length for the cache key, it will allocate whatever is available + * and adjust the key's length accordingly. This function does not allocate + * space that crosses segment boundaries. 
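+ *
+ * Runs with preemption disabled while the data head is examined and advanced.
+ * On success a reference is held on the cache segment now referenced by
+ * key->cache_pos, and key->len may have been reduced to the space that was
+ * left in that segment.  Returns -EBUSY (propagated from
+ * cache_data_head_init()) when no free cache segment is available to start a
+ * new data head.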
+ */ +static int cache_data_alloc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_cache_data_head *data_head; + struct pcache_cache_pos *head_pos; + struct pcache_cache_segment *cache_seg; + u32 seg_remain; + u32 allocated = 0, to_alloc; + int ret = 0; + + preempt_disable(); + data_head = get_data_head(cache); +again: + to_alloc = key->len - allocated; + if (!data_head->head_pos.cache_seg) { + seg_remain = 0; + } else { + cache_pos_copy(&key->cache_pos, &data_head->head_pos); + key->seg_gen = key->cache_pos.cache_seg->gen; + + head_pos = &data_head->head_pos; + cache_seg = head_pos->cache_seg; + seg_remain = cache_seg_remain(head_pos); + } + + if (seg_remain > to_alloc) { + /* If remaining space in segment is sufficient for the cache key, allocate it. */ + cache_pos_advance(head_pos, to_alloc); + allocated += to_alloc; + cache_seg_get(cache_seg); + } else if (seg_remain) { + /* If remaining space is not enough, allocate the remaining space and adjust the cache key length. */ + cache_pos_advance(head_pos, seg_remain); + key->len = seg_remain; + + /* Get for key: obtain a reference to the cache segment for the key. */ + cache_seg_get(cache_seg); + /* Put for head_pos->cache_seg: release the reference for the current head's segment. */ + cache_seg_put(head_pos->cache_seg); + head_pos->cache_seg = NULL; + } else { + /* Initialize a new data head if no segment is available. */ + ret = cache_data_head_init(cache); + if (ret) + goto out; + + goto again; + } + +out: + preempt_enable(); + + return ret; +} + +static int cache_copy_from_req_bio(struct pcache_cache *cache, struct pcache_cache_key *key, + struct pcache_request *pcache_req, u32 bio_off) +{ + struct pcache_cache_pos *pos = &key->cache_pos; + struct pcache_segment *segment; + + segment = &pos->cache_seg->segment; + + return segment_copy_from_bio(segment, pos->seg_off, key->len, pcache_req->bio, bio_off); +} + +static int cache_copy_to_req_bio(struct pcache_cache *cache, struct pcache_request *pcache_req, + u32 bio_off, u32 len, struct pcache_cache_pos *pos, u64 key_gen) +{ + struct pcache_cache_segment *cache_seg = pos->cache_seg; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + spin_lock(&cache_seg->gen_lock); + if (key_gen < cache_seg->gen) { + spin_unlock(&cache_seg->gen_lock); + return -EINVAL; + } + + ret = segment_copy_to_bio(segment, pos->seg_off, len, pcache_req->bio, bio_off); + spin_unlock(&cache_seg->gen_lock); + + return ret; +} + +/** + * miss_read_end_req - Handle the end of a miss read request. + * @backing_req: Pointer to the request structure. + * @read_ret: Return value of read. + * + * This function is called when a backing request to read data from + * the backing_dev is completed. If the key associated with the request + * is empty (a placeholder), it allocates cache space for the key, + * copies the data read from the bio into the cache, and updates + * the key's status. If the key has been overwritten by a write + * request during this process, it will be deleted from the cache + * tree and no further action will be taken. 
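+ *
+ * The key handling runs under the owning subtree's tree_lock.  If any step
+ * fails (the backing read itself, cache space allocation, copying from the
+ * bio, or appending the key to a kset), the placeholder key is deleted and the
+ * range simply stays uncached, so a later read will issue a new miss request.
+ * The reference taken on the key for ->priv_data is dropped at the end.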
+ */ +static void miss_read_end_req(struct pcache_backing_dev_req *backing_req, int read_ret) +{ + void *priv_data = backing_req->priv_data; + struct pcache_request *pcache_req = backing_req->req.upper_req; + struct pcache_cache *cache = backing_req->backing_dev->cache; + int ret; + + if (priv_data) { + struct pcache_cache_key *key; + struct pcache_cache_subtree *cache_subtree; + + key = (struct pcache_cache_key *)priv_data; + cache_subtree = key->cache_subtree; + + /* if this key was deleted from cache_subtree by a write, key->flags should be cleared, + * so if cache_key_empty() return true, this key is still in cache_subtree + */ + spin_lock(&cache_subtree->tree_lock); + if (cache_key_empty(key)) { + /* Check if the backing request was successful. */ + if (read_ret) { + cache_key_delete(key); + goto unlock; + } + + /* Allocate cache space for the key and copy data from the backing_dev. */ + ret = cache_data_alloc(cache, key); + if (ret) { + cache_key_delete(key); + goto unlock; + } + + ret = cache_copy_from_req_bio(cache, key, pcache_req, backing_req->req.bio_off); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + key->flags &= ~PCACHE_CACHE_KEY_FLAGS_EMPTY; + key->flags |= PCACHE_CACHE_KEY_FLAGS_CLEAN; + + /* Append the key to the cache. */ + ret = cache_key_append(cache, key, false); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + } +unlock: + spin_unlock(&cache_subtree->tree_lock); + cache_key_put(key); + } +} + +/** + * submit_cache_miss_req - Submit a backing request when cache data is missing + * @cache: The cache context that manages cache operations + * @backing_req: The cache request containing information about the read request + * + * This function is used to handle cases where a cache read request cannot locate + * the required data in the cache. When such a miss occurs during `cache_subtree_walk`, + * it triggers a backing read request to fetch data from the backing storage. + * + * If `pcache_req->priv_data` is set, it points to a `pcache_cache_key`, representing + * a new cache key to be inserted into the cache. The function calls `cache_key_insert` + * to attempt adding the key. On insertion failure, it releases the key reference and + * clears `priv_data` to avoid further processing. 
+ */ +static void submit_cache_miss_req(struct pcache_cache *cache, struct pcache_backing_dev_req *backing_req) +{ + if (backing_req->priv_data) { + struct pcache_cache_key *key; + + /* Attempt to insert the key into the cache if priv_data is set */ + key = (struct pcache_cache_key *)backing_req->priv_data; + cache_key_insert(&cache->req_key_tree, key, true); + } + backing_dev_req_submit(backing_req, false); +} + +static void cache_miss_req_free(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_cache_key *key; + + if (backing_req->priv_data) { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); /* for ->priv_data */ + cache_key_put(key); /* for init ref in alloc */ + } + + backing_dev_req_end(backing_req); +} + +static struct pcache_backing_dev_req *cache_miss_req_alloc(struct pcache_cache *cache, + struct pcache_request *parent, + gfp_t gfp_mask) +{ + struct pcache_backing_dev *backing_dev = cache->backing_dev; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_key *key = NULL; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.gfp_mask = gfp_mask; + req_opts.req.upper_req = parent; + + backing_req = backing_dev_req_alloc(backing_dev, &req_opts); + if (!backing_req) + return NULL; + + key = cache_key_alloc(&cache->req_key_tree, gfp_mask); + if (!key) + goto free_backing_req; + + cache_key_get(key); + backing_req->priv_data = key; + + return backing_req; + +free_backing_req: + cache_miss_req_free(backing_req); + return NULL; +} + +static void cache_miss_req_init(struct pcache_cache *cache, + struct pcache_backing_dev_req *backing_req, + struct pcache_request *parent, + u32 off, u32 len, bool insert_key) +{ + struct pcache_cache_key *key; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.req.upper_req = parent; + req_opts.req.req_off = off; + req_opts.req.len = len; + req_opts.end_fn = miss_read_end_req; + + backing_dev_req_init(backing_req, &req_opts); + + if (insert_key) { + key = backing_req->priv_data; + key->off = parent->off + off; + key->len = len; + key->flags |= PCACHE_CACHE_KEY_FLAGS_EMPTY; + } else { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); + cache_key_put(key); + } +} + +static struct pcache_backing_dev_req *get_pre_alloc_req(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_request *pcache_req = ctx->pcache_req; + struct pcache_backing_dev_req *backing_req; + + if (ctx->pre_alloc_req) { + backing_req = ctx->pre_alloc_req; + ctx->pre_alloc_req = NULL; + + return backing_req; + } + + return cache_miss_req_alloc(cache, pcache_req, GFP_NOWAIT); +} + +/* + * In the process of walking the cache tree to locate cached data, this + * function handles the situation where the requested data range lies + * entirely before an existing cache node (`key_tmp`). This outcome + * signifies that the target data is absent from the cache (cache miss). + * + * To fulfill this portion of the read request, the function creates a + * backing request (`backing_req`) for the missing data range represented + * by `key`. It then appends this request to the submission list in the + * `ctx`, which will later be processed to retrieve the data from backing + * storage. 
After setting up the backing request, `req_done` in `ctx` is + * updated to reflect the length of the handled range, and the range + * in `key` is adjusted by trimming off the portion that is now handled. + * + * The scenario handled here: + * + * |--------| key_tmp (existing cached range) + * |====| key (requested range, preceding key_tmp) + * + * Since `key` is before `key_tmp`, it signifies that the requested data + * range is missing in the cache (cache miss) and needs retrieval from + * backing storage. + */ +static int read_before(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_backing_dev_req *backing_req; + struct pcache_cache *cache = ctx->cache_tree->cache; + + /* + * In this scenario, `key` represents a range that precedes `key_tmp`, + * meaning the requested data range is missing from the cache tree + * and must be retrieved from the backing_dev. + */ + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * During cache_subtree_walk, this function manages a scenario where part of the + * requested data range overlaps with an existing cache node (`key_tmp`). + * + * |----------------| key_tmp (existing cached range) + * |===========| key (requested range, overlapping the tail of key_tmp) + */ +static int read_overlap_tail(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the length of the non-overlapping portion of `key` + * before `key_tmp`, representing the data missing in the cache. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion by calculating the length of + * the remaining data in `key` that coincides with `key_tmp`. 
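+ * If key_tmp is an empty placeholder (its miss read has not completed yet),
+ * another miss request is sent for this range instead of copying; otherwise
+ * the data is copied from the cache segment into the request bio.  A -EINVAL
+ * from cache_copy_to_req_bio() means the segment generation changed, so
+ * key_tmp is deleted and SUBTREE_WALK_RET_RESEARCH is returned.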
+ */ + io_len = cache_key_lend(key) - cache_key_lstart(key_tmp); + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |----| key_tmp (existing cached range) + * |==========| key (requested range) + */ +static int read_overlap_contain(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the non-overlapping part of `key` before `key_tmp` + * to identify the missing data length. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion between `key` and `key_tmp`. + */ + io_len = key_tmp->len; + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |-----------| key_tmp (existing cached range) + * |====| key (requested range, fully within key_tmp) + * + * If `key_tmp` contains valid cached data, this function copies the relevant + * portion to the request's bio. Otherwise, it sends a backing request to + * fetch the required data range. + */ +static int read_overlap_contained(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + int ret; + + /* + * Check if `key_tmp` is empty, indicating a miss. If so, initiate + * a backing request to fetch the required data for `key`. 
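+ * Otherwise the copy starts at key_tmp's cache position advanced by the offset
+ * of key within key_tmp, and key->len bytes are copied into the bio.  As in
+ * the other overlap helpers, -EINVAL from cache_copy_to_req_bio() means the
+ * segment generation changed, so key_tmp is deleted and the subtree walk is
+ * restarted.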
+ */ + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + key->len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |--------| key_tmp (existing cached range) + * |==========| key (requested range, overlapping the head of key_tmp) + */ +static int read_overlap_head(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + u32 io_len; + int ret; + + io_len = cache_key_lend(key_tmp) - cache_key_lstart(key); + + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/** + * read_walk_finally - Finalizes the cache read tree walk by submitting any + * remaining backing requests + * @ctx: Context structure holding information about the cache, + * read request, and submission list + * @ret: the return value after this walk. + * + * This function is called at the end of the `cache_subtree_walk` during a + * cache read operation. It completes the walk by checking if any data + * requested by `key` was not found in the cache tree, and if so, it sends + * a backing request to retrieve that data. Then, it iterates through the + * submission list of backing requests created during the walk, removing + * each request from the list and submitting it. + * + * The scenario managed here includes: + * - Sending a backing request for the remaining length of `key` if it was + * not fulfilled by existing cache entries. + * - Iterating through `ctx->submit_req_list` to submit each backing request + * enqueued during the walk. + * + * This ensures all necessary backing requests for cache misses are submitted + * to the backing storage to retrieve any data that could not be found in + * the cache. 
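+ *
+ * The requests queued on ctx->submit_req_list are submitted before the walk
+ * status is checked, so they are issued even when the walk ended with
+ * SUBTREE_WALK_RET_RESEARCH or SUBTREE_WALK_RET_NEED_REQ; the trailing request
+ * for any remaining part of `key` is only sent when the walk finished with
+ * SUBTREE_WALK_RET_OK.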
+ */ +static int read_walk_finally(struct pcache_cache_subtree_walk_ctx *ctx, int ret) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req, *next_req; + struct pcache_cache_key *key = ctx->key; + + list_for_each_entry_safe(backing_req, next_req, ctx->submit_req_list, node) { + list_del_init(&backing_req->node); + submit_cache_miss_req(ctx->cache_tree->cache, backing_req); + } + + if (ret != SUBTREE_WALK_RET_OK) + return ret; + + if (key->len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + submit_cache_miss_req(cache, backing_req); + ctx->req_done += key->len; + } + + return SUBTREE_WALK_RET_OK; +} + +/* + * This function is used within `cache_subtree_walk` to determine whether the + * read operation has covered the requested data length. It compares the + * amount of data processed (`ctx->req_done`) with the total data length + * specified in the original request (`ctx->pcache_req->data_len`). + * + * If `req_done` meets or exceeds the required data length, the function + * returns `true`, indicating the walk is complete. Otherwise, it returns `false`, + * signaling that additional data processing is needed to fulfill the request. + */ +static bool read_walk_done(struct pcache_cache_subtree_walk_ctx *ctx) +{ + return (ctx->req_done >= ctx->pcache_req->data_len); +} + +/** + * cache_read - Process a read request by traversing the cache tree + * @cache: Cache structure holding cache trees and related configurations + * @pcache_req: Request structure with information about the data to read + * + * This function attempts to fulfill a read request by traversing the cache tree(s) + * to locate cached data for the requested range. If parts of the data are missing + * in the cache, backing requests are generated to retrieve the required segments. + * + * The function operates by initializing a key for the requested data range and + * preparing a context (`walk_ctx`) to manage the cache tree traversal. The context + * includes pointers to functions (e.g., `read_before`, `read_overlap_tail`) that handle + * specific conditions encountered during the traversal. The `walk_finally` and `walk_done` + * functions manage the end stages of the traversal, while the `delete_key_list` and + * `submit_req_list` lists track any keys to be deleted or requests to be submitted. + * + * The function first calculates the requested range and checks if it fits within the + * current cache tree (based on the tree's size limits). It then locks the cache tree + * and performs a search to locate any matching keys. If there are outdated keys, + * these are deleted, and the search is restarted to ensure accurate data retrieval. + * + * If the requested range spans multiple cache trees, the function moves on to the + * next tree once the current range has been processed. This continues until the + * entire requested data length has been handled. 
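+ *
+ * Miss requests prepared while the subtree lock is held are allocated with
+ * GFP_NOWAIT; if such an allocation fails the walk returns
+ * SUBTREE_WALK_RET_NEED_REQ, the lock is dropped, a request is pre-allocated
+ * with GFP_NOIO and the same range is walked again.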
+ */ +static int cache_read(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_key key_data = { .off = pcache_req->off, .len = pcache_req->data_len }; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + struct pcache_cache_key *key = &key_data; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct pcache_backing_dev_req *backing_req, *next_req; + LIST_HEAD(delete_key_list); + LIST_HEAD(submit_req_list); + int ret; + + walk_ctx.cache_tree = &cache->req_key_tree; + walk_ctx.req_done = 0; + walk_ctx.pcache_req = pcache_req; + walk_ctx.before = read_before; + walk_ctx.overlap_tail = read_overlap_tail; + walk_ctx.overlap_head = read_overlap_head; + walk_ctx.overlap_contain = read_overlap_contain; + walk_ctx.overlap_contained = read_overlap_contained; + walk_ctx.walk_finally = read_walk_finally; + walk_ctx.walk_done = read_walk_done; + walk_ctx.delete_key_list = &delete_key_list; + walk_ctx.submit_req_list = &submit_req_list; + +next: + key->off = pcache_req->off + walk_ctx.req_done; + key->len = pcache_req->data_len - walk_ctx.req_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); +search: + prev_node = cache_subtree_search(cache_subtree, key, NULL, NULL, &delete_key_list); + if (!list_empty(&delete_key_list)) { + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + ret = cache_subtree_walk(&walk_ctx); + if (ret == SUBTREE_WALK_RET_RESEARCH) + goto search; + spin_unlock(&cache_subtree->tree_lock); + + if (ret == SUBTREE_WALK_RET_ERR) { + ret = walk_ctx.ret; + goto out; + } + + if (ret == SUBTREE_WALK_RET_NEED_REQ) { + walk_ctx.pre_alloc_req = cache_miss_req_alloc(cache, pcache_req, GFP_NOIO); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_req with GFP_NOIO"); + } + + if (walk_ctx.req_done < pcache_req->data_len) + goto next; + ret = 0; +out: + if (walk_ctx.pre_alloc_req) + cache_miss_req_free(walk_ctx.pre_alloc_req); + + list_for_each_entry_safe(backing_req, next_req, &submit_req_list, node) { + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } + + return ret; +} + +static int cache_write(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + u64 offset = pcache_req->off; + u32 length = pcache_req->data_len; + u32 io_done = 0; + int ret; + + while (true) { + if (io_done >= length) + break; + + key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO); + key->off = offset + io_done; + key->len = length - io_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + ret = cache_data_alloc(cache, key); + if (ret) { + cache_key_put(key); + goto err; + } + + ret = cache_copy_from_req_bio(cache, key, pcache_req, io_done); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_put(key); + goto err; + } + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + 
spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->req_key_tree, key, true); + ret = cache_key_append(cache, key, pcache_req->bio->bi_opf & REQ_FUA); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + + io_done += key->len; + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +unlock: + spin_unlock(&cache_subtree->tree_lock); +err: + return ret; +} + +/** + * pcache_cache_flush - Flush all ksets to persist any pending cache data + * @cache: Pointer to the cache structure + * + * This function iterates through all ksets associated with the provided `cache` + * and ensures that any data marked for persistence is written to media. For each + * kset, it acquires the kset lock, then invokes `cache_kset_close`, which handles + * the persistence logic for that kset. + * + * If `cache_kset_close` encounters an error, the function exits immediately with + * the respective error code, preventing the flush operation from proceeding to + * subsequent ksets. + */ +int pcache_cache_flush(struct pcache_cache *cache) +{ + struct pcache_cache_kset *kset; + int ret; + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + kset = get_kset(cache, i); + + spin_lock(&kset->kset_lock); + ret = cache_kset_close(cache, kset); + spin_unlock(&kset->kset_lock); + + if (ret) + return ret; + } + + return 0; +} + +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct bio *bio = pcache_req->bio; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + return pcache_cache_flush(cache); + + if (bio_data_dir(bio) == READ) + return cache_read(cache, pcache_req); + + return cache_write(cache, pcache_req); +} diff --git a/drivers/md/dm-pcache/cache_segment.c b/drivers/md/dm-pcache/cache_segment.c new file mode 100644 index 000000000000..f0b58980806e --- /dev/null +++ b/drivers/md/dm-pcache/cache_segment.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache_dev.h" +#include "cache.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +static inline struct pcache_segment_info *get_seg_info_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + u32 seg_id = cache_seg->segment.seg_id; + void *seg_addr; + + seg_addr = CACHE_DEV_SEGMENT(cache_seg->cache->cache_dev, seg_id); + seg_info_addr = seg_addr + PCACHE_SEG_INFO_SIZE * cache_seg->info_index; + + return seg_info_addr; +} + +static void cache_seg_info_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + struct pcache_segment_info *seg_info = &cache_seg->cache_seg_info; + + mutex_lock(&cache_seg->info_lock); + seg_info->header.seq++; + seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info)); + + seg_info_addr = get_seg_info_addr(cache_seg); + memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info)); + pmem_wmb(); + + cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; + mutex_unlock(&cache_seg->info_lock); +} + +static int cache_seg_info_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *cache_seg_info_addr_base, *cache_seg_info_addr; + struct pcache_cache_dev *cache_dev = cache_seg->cache->cache_dev; + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 seg_id = cache_seg->segment.seg_id; + int ret = 0; + + cache_seg_info_addr_base = CACHE_DEV_SEGMENT(cache_dev, seg_id); + + mutex_lock(&cache_seg->info_lock); + cache_seg_info_addr = 
pcache_meta_find_latest(&cache_seg_info_addr_base->header, + sizeof(struct pcache_segment_info), + PCACHE_SEG_INFO_SIZE, + &cache_seg->cache_seg_info); + if (IS_ERR(cache_seg_info_addr)) { + ret = PTR_ERR(cache_seg_info_addr); + goto out; + } else if (!cache_seg_info_addr) { + ret = -EIO; + goto out; + } + cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base; +out: + mutex_unlock(&cache_seg->info_lock); + + if (ret) + pcache_dev_err(pcache, "can't read segment info of segment: %u, ret: %d\n", + cache_seg->segment.seg_id, ret); + return ret; +} + +static int cache_seg_ctrl_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + struct pcache_cache_seg_gen cache_seg_gen, *cache_seg_gen_addr; + int ret = 0; + + cache_seg_gen_addr = pcache_meta_find_latest(&cache_seg_ctrl->gen->header, + sizeof(struct pcache_cache_seg_gen), + sizeof(struct pcache_cache_seg_gen), + &cache_seg_gen); + if (IS_ERR(cache_seg_gen_addr)) { + ret = PTR_ERR(cache_seg_gen_addr); + goto out; + } + + if (!cache_seg_gen_addr) { + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + goto out; + } + + cache_seg->gen = cache_seg_gen.gen; + cache_seg->gen_seq = cache_seg_gen.header.seq; + cache_seg->gen_index = (cache_seg_gen_addr - cache_seg_ctrl->gen); +out: + + return ret; +} + +static inline struct pcache_cache_seg_gen *get_cache_seg_gen_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + + return (cache_seg_ctrl->gen + cache_seg->gen_index); +} + +/* + * cache_seg_ctrl_write - write cache segment control information + * @seg: the cache segment to update + * + * This function writes the control information of a cache segment to media. + * + * Although this updates shared control data, we intentionally do not use + * any locking here. All accesses to control information are single-threaded: + * + * - All reads occur during the init phase, where no concurrent writes + * can happen. + * - Writes happen once during init and once when the last reference + * to the segment is dropped in cache_seg_put(). + * + * Both cases are guaranteed to be single-threaded, so there is no risk + * of concurrent read/write races. + */ +static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_gen cache_seg_gen; + + cache_seg_gen.gen = cache_seg->gen; + cache_seg_gen.header.seq = ++cache_seg->gen_seq; + cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header, + sizeof(struct pcache_cache_seg_gen)); + + memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen)); + pmem_wmb(); + + cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg) +{ + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + cache_seg_ctrl_write(cache_seg); +} + +static int cache_seg_meta_load(struct pcache_cache_segment *cache_seg) +{ + int ret; + + ret = cache_seg_info_load(cache_seg); + if (ret) + goto err; + + ret = cache_seg_ctrl_load(cache_seg); + if (ret) + goto err; + + return 0; +err: + return ret; +} + +/** + * cache_seg_set_next_seg - Sets the ID of the next segment + * @cache_seg: Pointer to the cache segment structure. + * @seg_id: The segment ID to set as the next segment. 
+ * + * A pcache_cache allocates multiple cache segments, which are linked together + * through next_seg. When loading a pcache_cache, the first cache segment can + * be found using cache->seg_id, which allows access to all the cache segments. + */ +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id) +{ + cache_seg->cache_seg_info.flags |= PCACHE_SEG_INFO_FLAGS_HAS_NEXT; + cache_seg->cache_seg_info.next_seg = seg_id; + cache_seg_info_write(cache_seg); +} + +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache) +{ + struct pcache_cache_dev *cache_dev = cache->cache_dev; + struct pcache_cache_segment *cache_seg = &cache->segments[cache_seg_id]; + struct pcache_segment_init_options seg_options = { 0 }; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + cache_seg->cache = cache; + cache_seg->cache_seg_id = cache_seg_id; + spin_lock_init(&cache_seg->gen_lock); + atomic_set(&cache_seg->refs, 0); + mutex_init(&cache_seg->info_lock); + + /* init pcache_segment */ + seg_options.type = PCACHE_SEGMENT_TYPE_CACHE_DATA; + seg_options.data_off = PCACHE_CACHE_SEG_CTRL_OFF + PCACHE_CACHE_SEG_CTRL_SIZE; + seg_options.seg_id = seg_id; + seg_options.seg_info = &cache_seg->cache_seg_info; + pcache_segment_init(cache_dev, segment, &seg_options); + + cache_seg->cache_seg_ctrl = CACHE_DEV_SEGMENT(cache_dev, seg_id) + PCACHE_CACHE_SEG_CTRL_OFF; + + if (new_cache) { + cache_dev_zero_range(cache_dev, CACHE_DEV_SEGMENT(cache_dev, seg_id), + PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_SEG_CTRL_SIZE); + + cache_seg_ctrl_init(cache_seg); + + cache_seg->info_index = 0; + cache_seg_info_write(cache_seg); + + /* clear outdated kset in segment */ + memcpy_flushcache(segment->data, &pcache_empty_kset, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + } else { + ret = cache_seg_meta_load(cache_seg); + if (ret) + goto err; + } + + return 0; +err: + return ret; +} + +/** + * get_cache_segment - Retrieves a free cache segment from the cache. + * @cache: Pointer to the cache structure. + * + * This function attempts to find a free cache segment that can be used. + * It locks the segment map and checks for the next available segment ID. + * If a free segment is found, it initializes it and returns a pointer to the + * cache segment structure. Returns NULL if no segments are available. 
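+ *
+ * As an illustration of the search below (numbers are hypothetical): with
+ * n_segs = 8 and last_cache_seg = 5, the first find_next_zero_bit() pass
+ * scans bits 5..7 of seg_map; if those are all set, the hint is reset to 0
+ * and bits 0..7 are scanned once more before cache_full is set and NULL is
+ * returned.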
+ */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache) +{ + struct pcache_cache_segment *cache_seg; + u32 seg_id; + + spin_lock(&cache->seg_map_lock); +again: + seg_id = find_next_zero_bit(cache->seg_map, cache->n_segs, cache->last_cache_seg); + if (seg_id == cache->n_segs) { + /* reset the hint of ->last_cache_seg and retry */ + if (cache->last_cache_seg) { + cache->last_cache_seg = 0; + goto again; + } + cache->cache_full = true; + spin_unlock(&cache->seg_map_lock); + return NULL; + } + + /* + * found an available cache_seg, mark it used in seg_map + * and update the search hint ->last_cache_seg + */ + __set_bit(seg_id, cache->seg_map); + cache->last_cache_seg = seg_id; + spin_unlock(&cache->seg_map_lock); + + cache_seg = &cache->segments[seg_id]; + cache_seg->cache_seg_id = seg_id; + + return cache_seg; +} + +static void cache_seg_gen_increase(struct pcache_cache_segment *cache_seg) +{ + spin_lock(&cache_seg->gen_lock); + cache_seg->gen++; + spin_unlock(&cache_seg->gen_lock); + + cache_seg_ctrl_write(cache_seg); +} + +void cache_seg_get(struct pcache_cache_segment *cache_seg) +{ + atomic_inc(&cache_seg->refs); +} + +static void cache_seg_invalidate(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache *cache; + + cache = cache_seg->cache; + cache_seg_gen_increase(cache_seg); + + spin_lock(&cache->seg_map_lock); + if (cache->cache_full) + cache->cache_full = false; + __clear_bit(cache_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); + + pcache_defer_reqs_kick(CACHE_TO_PCACHE(cache)); + /* clean_work will clean the bad key in key_tree*/ + queue_work(cache_get_wq(cache), &cache->clean_work); +} + +void cache_seg_put(struct pcache_cache_segment *cache_seg) +{ + if (atomic_dec_and_test(&cache_seg->refs)) + cache_seg_invalidate(cache_seg); +} diff --git a/drivers/md/dm-pcache/cache_writeback.c b/drivers/md/dm-pcache/cache_writeback.c new file mode 100644 index 000000000000..87a82b3fe836 --- /dev/null +++ b/drivers/md/dm-pcache/cache_writeback.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/bio.h> + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static void writeback_ctx_end(struct pcache_cache *cache, int ret) +{ + if (ret && !cache->writeback_ctx.ret) { + pcache_dev_err(CACHE_TO_PCACHE(cache), "writeback error: %d", ret); + cache->writeback_ctx.ret = ret; + } + + if (!atomic_dec_and_test(&cache->writeback_ctx.pending)) + return; + + if (!cache->writeback_ctx.ret) { + backing_dev_flush(cache->backing_dev); + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_advance(&cache->dirty_tail, cache->writeback_ctx.advance); + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); + } + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); +} + +static void writeback_end_req(struct pcache_backing_dev_req *backing_req, int ret) +{ + struct pcache_cache *cache = backing_req->priv_data; + + mutex_lock(&cache->writeback_lock); + writeback_ctx_end(cache, ret); + mutex_unlock(&cache->writeback_lock); +} + +static inline bool is_cache_clean(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 to_copy; + void *addr; + int ret; + + addr = cache_pos_addr(dirty_tail); + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - 
dirty_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "error to read kset: %d", ret); + return true; + } + + /* Check if the magic number matches the expected value */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u magic: %llx, not expected: %llx\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return true; + } + + /* Verify the CRC checksum for data integrity */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u crc: %x, not expected: %x\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return true; + } + + return false; +} + +void cache_writeback_exit(struct pcache_cache *cache) +{ + cancel_delayed_work_sync(&cache->writeback_work); + backing_dev_flush(cache->backing_dev); + cache_tree_exit(&cache->writeback_key_tree); +} + +int cache_writeback_init(struct pcache_cache *cache) +{ + int ret; + + ret = cache_tree_init(cache, &cache->writeback_key_tree, 1); + if (ret) + goto err; + + atomic_set(&cache->writeback_ctx.pending, 0); + + /* Queue delayed work to start writeback handling */ + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); + + return 0; +err: + return ret; +} + +static void cache_key_writeback(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_backing_dev_req *writeback_req; + struct pcache_backing_dev_req_opts writeback_req_opts = { 0 }; + struct pcache_cache_pos *pos; + void *addr; + u32 seg_remain, req_len, done = 0; + + if (cache_key_clean(key)) + return; + + pos = &key->cache_pos; + + seg_remain = cache_seg_remain(pos); + BUG_ON(seg_remain < key->len); +next_req: + addr = cache_pos_addr(pos) + done; + req_len = backing_dev_req_coalesced_max_len(addr, key->len - done); + + writeback_req_opts.type = BACKING_DEV_REQ_TYPE_KMEM; + writeback_req_opts.gfp_mask = GFP_NOIO; + writeback_req_opts.end_fn = writeback_end_req; + writeback_req_opts.priv_data = cache; + + writeback_req_opts.kmem.data = addr; + writeback_req_opts.kmem.opf = REQ_OP_WRITE; + writeback_req_opts.kmem.len = req_len; + writeback_req_opts.kmem.backing_off = key->off + done; + + writeback_req = backing_dev_req_create(cache->backing_dev, &writeback_req_opts); + + atomic_inc(&cache->writeback_ctx.pending); + backing_dev_req_submit(writeback_req, true); + + done += req_len; + if (done < key->len) + goto next_req; +} + +static void cache_wb_tree_writeback(struct pcache_cache *cache, u32 advance) +{ + struct pcache_cache_tree *cache_tree = &cache->writeback_key_tree; + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + cache->writeback_ctx.ret = 0; + cache->writeback_ctx.advance = advance; + atomic_set(&cache->writeback_ctx.pending, 1); + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_writeback(cache, key); + cache_key_delete(key); + } + } + writeback_ctx_end(cache, 0); +} + +static int cache_kset_insert_tree(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + int ret; + u32 i; + + /* Iterate through all keys 
in the kset and write each back to storage */ + for (i = 0; i < kset_onmedia->key_num; i++) { + key_onmedia = &kset_onmedia->data[i]; + + key = cache_key_alloc(&cache->writeback_key_tree, GFP_NOIO); + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + cache_key_put(key); + goto clear_tree; + } + + cache_subtree = get_subtree(&cache->writeback_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->writeback_key_tree, key, true); + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +clear_tree: + cache_tree_clear(&cache->writeback_key_tree); + return ret; +} + +static void last_kset_writeback(struct pcache_cache *cache, + struct pcache_cache_kset_onmedia *last_kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *next_seg; + + pcache_dev_debug(pcache, "last kset, next: %u\n", last_kset_onmedia->next_cache_seg_id); + + next_seg = &cache->segments[last_kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->dirty_tail_lock); + cache->dirty_tail.cache_seg = next_seg; + cache->dirty_tail.seg_off = 0; + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); +} + +void cache_writeback_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, writeback_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 delay; + int ret; + + mutex_lock(&cache->writeback_lock); + if (atomic_read(&cache->writeback_ctx.pending)) + goto unlock; + + if (pcache_is_stopping(pcache)) + goto unlock; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + if (is_cache_clean(cache, &dirty_tail)) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + last_kset_writeback(cache, kset_onmedia); + delay = 0; + goto queue_work; + } + + ret = cache_kset_insert_tree(cache, kset_onmedia); + if (ret) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + cache_wb_tree_writeback(cache, get_kset_onmedia_size(kset_onmedia)); + delay = 0; +queue_work: + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, delay); +unlock: + mutex_unlock(&cache->writeback_lock); +} diff --git a/drivers/md/dm-pcache/dm_pcache.c b/drivers/md/dm-pcache/dm_pcache.c new file mode 100644 index 000000000000..e5f5936fa6f0 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/bio.h> + +#include "../dm-core.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +void pcache_defer_reqs_kick(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + spin_lock(&cache->seg_map_lock); + if (!cache->cache_full) + queue_work(pcache->task_wq, &pcache->defered_req_work); + spin_unlock(&cache->seg_map_lock); +} + +static void defer_req(struct pcache_request *pcache_req) +{ + struct dm_pcache *pcache = pcache_req->pcache; + + BUG_ON(!list_empty(&pcache_req->list_node)); + + spin_lock(&pcache->defered_req_list_lock); + list_add(&pcache_req->list_node, &pcache->defered_req_list); + pcache_defer_reqs_kick(pcache); + spin_unlock(&pcache->defered_req_list_lock); 
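+ /*
+ * Note: pcache_defer_reqs_kick() queues defered_req_work only while the
+ * cache is not full; when it is full, the request simply stays on
+ * defered_req_list and the work is kicked again later (e.g. from
+ * cache_seg_invalidate() once a cache segment is released).
+ */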
+} + +static void defered_req_fn(struct work_struct *work) +{ + struct dm_pcache *pcache = container_of(work, struct dm_pcache, defered_req_work); + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + int ret; + + if (pcache_is_stopping(pcache)) + return; + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req->ret = 0; + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + } +} + +void pcache_req_get(struct pcache_request *pcache_req) +{ + kref_get(&pcache_req->ref); +} + +static void end_req(struct kref *ref) +{ + struct pcache_request *pcache_req = container_of(ref, struct pcache_request, ref); + struct dm_pcache *pcache = pcache_req->pcache; + struct bio *bio = pcache_req->bio; + int ret = pcache_req->ret; + + if (ret == -EBUSY) { + pcache_req_get(pcache_req); + defer_req(pcache_req); + } else { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + + if (atomic_dec_and_test(&pcache->inflight_reqs)) + wake_up(&pcache->inflight_wq); + } +} + +void pcache_req_put(struct pcache_request *pcache_req, int ret) +{ + /* Set the return status if it is not already set */ + if (ret && !pcache_req->ret) + pcache_req->ret = ret; + + kref_put(&pcache_req->ref, end_req); +} + +static bool at_least_one_arg(struct dm_arg_set *as, char **error) +{ + if (!as->argc) { + *error = "Insufficient args"; + return false; + } + + return true; +} + +static int parse_cache_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->cache_dev.dm_dev); + if (ret) { + *error = "Error opening cache device"; + return ret; + } + + return 0; +} + +static int parse_backing_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->backing_dev.dm_dev); + if (ret) { + *error = "Error opening backing device"; + return ret; + } + + return 0; +} + +static void pcache_init_opts(struct pcache_cache_options *opts) +{ + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + opts->data_crc = false; +} + +static int parse_cache_opts(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + struct pcache_cache_options *opts = &pcache->opts; + static const struct dm_arg _args[] = { + {0, 4, "Invalid number of cache option arguments"}, + }; + unsigned int argc; + const char *arg; + int ret; + + pcache_init_opts(opts); + if (!as->argc) + return 0; + + ret = dm_read_arg_group(_args, as, &argc, error); + if (ret) + return -EINVAL; + + while (argc) { + arg = dm_shift_arg(as); + argc--; + + if (!strcmp(arg, "cache_mode")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "writeback")) { + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + } else { + *error = "Invalid cache mode parameter"; + return -EINVAL; + } + argc--; + } else if (!strcmp(arg, "data_crc")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "true")) { + opts->data_crc = true; + } else if (!strcmp(arg, "false")) { + opts->data_crc = false; + } else { + 
*error = "Invalid data crc parameter"; + return -EINVAL; + } + argc--; + } else { + *error = "Unrecognised cache option requested"; + return -EINVAL; + } + } + + return 0; +} + +static int pcache_start(struct dm_pcache *pcache, char **error) +{ + int ret; + + ret = cache_dev_start(pcache); + if (ret) { + *error = "Failed to start cache dev"; + return ret; + } + + ret = backing_dev_start(pcache); + if (ret) { + *error = "Failed to start backing dev"; + goto stop_cache; + } + + ret = pcache_cache_start(pcache); + if (ret) { + *error = "Failed to start pcache"; + goto stop_backing; + } + + return 0; +stop_backing: + backing_dev_stop(pcache); +stop_cache: + cache_dev_stop(pcache); + + return ret; +} + +static void pcache_destroy_args(struct dm_pcache *pcache) +{ + if (pcache->cache_dev.dm_dev) + dm_put_device(pcache->ti, pcache->cache_dev.dm_dev); + if (pcache->backing_dev.dm_dev) + dm_put_device(pcache->ti, pcache->backing_dev.dm_dev); +} + +static int pcache_parse_args(struct dm_pcache *pcache, unsigned int argc, char **argv, + char **error) +{ + struct dm_arg_set as; + int ret; + + as.argc = argc; + as.argv = argv; + + /* + * Parse cache device + */ + ret = parse_cache_dev(pcache, &as, error); + if (ret) + return ret; + /* + * Parse backing device + */ + ret = parse_backing_dev(pcache, &as, error); + if (ret) + goto out; + /* + * Parse optional arguments + */ + ret = parse_cache_opts(pcache, &as, error); + if (ret) + goto out; + + return 0; +out: + pcache_destroy_args(pcache); + return ret; +} + +static int dm_pcache_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct mapped_device *md = ti->table->md; + struct dm_pcache *pcache; + int ret; + + if (md->map) { + ti->error = "Don't support table loading for live md"; + return -EOPNOTSUPP; + } + + /* Allocate memory for the cache structure */ + pcache = kzalloc(sizeof(struct dm_pcache), GFP_KERNEL); + if (!pcache) + return -ENOMEM; + + pcache->task_wq = alloc_workqueue("pcache-%s-wq", WQ_UNBOUND | WQ_MEM_RECLAIM, + 0, md->name); + if (!pcache->task_wq) { + ret = -ENOMEM; + goto free_pcache; + } + + spin_lock_init(&pcache->defered_req_list_lock); + INIT_LIST_HEAD(&pcache->defered_req_list); + INIT_WORK(&pcache->defered_req_work, defered_req_fn); + pcache->ti = ti; + + ret = pcache_parse_args(pcache, argc, argv, &ti->error); + if (ret) + goto destroy_wq; + + ret = pcache_start(pcache, &ti->error); + if (ret) + goto destroy_args; + + ti->num_flush_bios = 1; + ti->flush_supported = true; + ti->per_io_data_size = sizeof(struct pcache_request); + ti->private = pcache; + atomic_set(&pcache->inflight_reqs, 0); + atomic_set(&pcache->state, PCACHE_STATE_RUNNING); + init_waitqueue_head(&pcache->inflight_wq); + + return 0; +destroy_args: + pcache_destroy_args(pcache); +destroy_wq: + destroy_workqueue(pcache->task_wq); +free_pcache: + kfree(pcache); + + return ret; +} + +static void defer_req_stop(struct dm_pcache *pcache) +{ + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + + flush_work(&pcache->defered_req_work); + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req_put(pcache_req, -EIO); + } +} + +static void dm_pcache_dtr(struct dm_target *ti) +{ + struct dm_pcache *pcache; + + pcache = ti->private; + atomic_set(&pcache->state, PCACHE_STATE_STOPPING); + 
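+ /*
+ * Teardown order: mark the target as stopping so that defered_req_fn()
+ * and cache_writeback_fn() bail out early, fail any still-deferred
+ * requests with -EIO, wait for in-flight requests to drain, and only
+ * then stop the cache, the backing device and the cache device.
+ */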
defer_req_stop(pcache); + + wait_event(pcache->inflight_wq, + atomic_read(&pcache->inflight_reqs) == 0); + + pcache_cache_stop(pcache); + backing_dev_stop(pcache); + cache_dev_stop(pcache); + + pcache_destroy_args(pcache); + drain_workqueue(pcache->task_wq); + destroy_workqueue(pcache->task_wq); + + kfree(pcache); +} + +static int dm_pcache_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct pcache_request *pcache_req = dm_per_bio_data(bio, sizeof(struct pcache_request)); + struct dm_pcache *pcache = ti->private; + int ret; + + pcache_req->pcache = pcache; + kref_init(&pcache_req->ref); + pcache_req->ret = 0; + pcache_req->bio = bio; + pcache_req->off = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + pcache_req->data_len = bio->bi_iter.bi_size; + INIT_LIST_HEAD(&pcache_req->list_node); + atomic_inc(&pcache->inflight_reqs); + + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + + return DM_MAPIO_SUBMITTED; +} + +static void dm_pcache_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%x %u %u %u %u %x %u:%u %u:%u %u:%u", + cache_dev->sb_flags, + cache_dev->seg_num, + cache->n_segs, + bitmap_weight(cache->seg_map, cache->n_segs), + pcache_cache_get_gc_percent(cache), + cache->cache_info.flags, + cache->key_head.cache_seg->cache_seg_id, + cache->key_head.seg_off, + cache->dirty_tail.cache_seg->cache_seg_id, + cache->dirty_tail.seg_off, + cache->key_tail.cache_seg->cache_seg_id, + cache->key_tail.seg_off); + break; + case STATUSTYPE_TABLE: + DMEMIT("%s %s 4 cache_mode writeback crc %s", + cache_dev->dm_dev->name, + backing_dev->dm_dev->name, + cache_data_crc_on(cache) ? 
"true" : "false"); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int dm_pcache_message(struct dm_target *ti, unsigned int argc, + char **argv, char *result, unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + unsigned long val; + + if (argc != 2) + goto err; + + if (!strcasecmp(argv[0], "gc_percent")) { + if (kstrtoul(argv[1], 10, &val)) + goto err; + + return pcache_cache_set_gc_percent(&pcache->cache, val); + } +err: + return -EINVAL; +} + +static struct target_type dm_pcache_target = { + .name = "pcache", + .version = {0, 1, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON, + .ctr = dm_pcache_ctr, + .dtr = dm_pcache_dtr, + .map = dm_pcache_map_bio, + .status = dm_pcache_status, + .message = dm_pcache_message, +}; + +static int __init dm_pcache_init(void) +{ + int ret; + + ret = pcache_backing_init(); + if (ret) + goto err; + + ret = pcache_cache_init(); + if (ret) + goto backing_exit; + + ret = dm_register_target(&dm_pcache_target); + if (ret) + goto cache_exit; + return 0; + +cache_exit: + pcache_cache_exit(); +backing_exit: + pcache_backing_exit(); +err: + return ret; +} +module_init(dm_pcache_init); + +static void __exit dm_pcache_exit(void) +{ + dm_unregister_target(&dm_pcache_target); + pcache_cache_exit(); + pcache_backing_exit(); +} +module_exit(dm_pcache_exit); + +MODULE_DESCRIPTION("dm-pcache Persistent Cache for block device"); +MODULE_AUTHOR("Dongsheng Yang <dongsheng.yang@linux.dev>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-pcache/dm_pcache.h b/drivers/md/dm-pcache/dm_pcache.h new file mode 100644 index 000000000000..b4e06be0c0b9 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _DM_PCACHE_H +#define _DM_PCACHE_H +#include <linux/device-mapper.h> + +#include "../dm-core.h" + +#define CACHE_DEV_TO_PCACHE(cache_dev) (container_of(cache_dev, struct dm_pcache, cache_dev)) +#define BACKING_DEV_TO_PCACHE(backing_dev) (container_of(backing_dev, struct dm_pcache, backing_dev)) +#define CACHE_TO_PCACHE(cache) (container_of(cache, struct dm_pcache, cache)) + +#define PCACHE_STATE_RUNNING 1 +#define PCACHE_STATE_STOPPING 2 + +struct pcache_cache_dev; +struct pcache_backing_dev; +struct pcache_cache; +struct pcache_cache_options; +struct dm_pcache { + struct dm_target *ti; + struct pcache_cache_dev cache_dev; + struct pcache_backing_dev backing_dev; + struct pcache_cache cache; + struct pcache_cache_options opts; + + spinlock_t defered_req_list_lock; + struct list_head defered_req_list; + struct workqueue_struct *task_wq; + + struct work_struct defered_req_work; + + atomic_t state; + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; +}; + +static inline bool pcache_is_stopping(struct dm_pcache *pcache) +{ + return (atomic_read(&pcache->state) == PCACHE_STATE_STOPPING); +} + +#define pcache_dev_err(pcache, fmt, ...) \ + pcache_err("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_info(pcache, fmt, ...) \ + pcache_info("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_debug(pcache, fmt, ...) 
\ + pcache_debug("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) + +struct pcache_request { + struct dm_pcache *pcache; + struct bio *bio; + + u64 off; + u32 data_len; + + struct kref ref; + int ret; + + struct list_head list_node; +}; + +void pcache_req_get(struct pcache_request *pcache_req); +void pcache_req_put(struct pcache_request *pcache_req, int ret); + +void pcache_defer_reqs_kick(struct dm_pcache *pcache); + +#endif /* _DM_PCACHE_H */ diff --git a/drivers/md/dm-pcache/pcache_internal.h b/drivers/md/dm-pcache/pcache_internal.h new file mode 100644 index 000000000000..b7a3319d2bd3 --- /dev/null +++ b/drivers/md/dm-pcache/pcache_internal.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_INTERNAL_H +#define _PCACHE_INTERNAL_H + +#include <linux/delay.h> +#include <linux/crc32c.h> + +#define pcache_err(fmt, ...) \ + pr_err("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_info(fmt, ...) \ + pr_info("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_debug(fmt, ...) \ + pr_debug("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) + +#define PCACHE_KB (1024ULL) +#define PCACHE_MB (1024 * PCACHE_KB) + +/* Maximum number of metadata indices */ +#define PCACHE_META_INDEX_MAX 2 + +#define PCACHE_CRC_SEED 0x3B15A +/* + * struct pcache_meta_header - PCACHE metadata header structure + * @crc: CRC checksum for validating metadata integrity. + * @seq: Sequence number to track metadata updates. + * @version: Metadata version. + * @res: Reserved space for future use. + */ +struct pcache_meta_header { + __u32 crc; + __u8 seq; + __u8 version; + __u16 res; +}; + +/* + * pcache_meta_crc - Calculate CRC for the given metadata header. + * @header: Pointer to the metadata header. + * @meta_size: Size of the metadata structure. + * + * Returns the CRC checksum calculated by excluding the CRC field itself. + */ +static inline u32 pcache_meta_crc(struct pcache_meta_header *header, u32 meta_size) +{ + return crc32c(PCACHE_CRC_SEED, (void *)header + 4, meta_size - 4); +} + +/* + * pcache_meta_seq_after - Check if a sequence number is more recent, accounting for overflow. + * @seq1: First sequence number. + * @seq2: Second sequence number. + * + * Determines if @seq1 is more recent than @seq2 by calculating the signed + * difference between them. This approach allows handling sequence number + * overflow correctly because the difference wraps naturally, and any value + * greater than zero indicates that @seq1 is "after" @seq2. This method + * assumes 8-bit unsigned sequence numbers, where the difference wraps + * around if seq1 overflows past seq2. + * + * Returns: + * - true if @seq1 is more recent than @seq2, indicating it comes "after" + * - false otherwise. + */ +static inline bool pcache_meta_seq_after(u8 seq1, u8 seq2) +{ + return (s8)(seq1 - seq2) > 0; +} + +/* + * pcache_meta_find_latest - Find the latest valid metadata. + * @header: Pointer to the metadata header. + * @meta_size: Size of each metadata block. + * + * Finds the latest valid metadata by checking sequence numbers. If a + * valid entry with the highest sequence number is found, its pointer + * is returned. Returns NULL if no valid metadata is found. 
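+ * Returns an ERR_PTR if copying an on-media copy fails due to a hardware
+ * memory error.
+ *
+ * Each metadata block is kept in PCACHE_META_INDEX_MAX (two) copies; the
+ * copy with a valid CRC and the newest sequence number wins. Sequence
+ * numbers are compared with pcache_meta_seq_after(), so, for example, a
+ * seq of 1 is treated as newer than a seq of 255 after wrap-around,
+ * because (s8)(1 - 255) == 2 > 0.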
+ */ +static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_header *header, + u32 meta_size, u32 meta_max_size, + void *meta_ret) +{ + struct pcache_meta_header *meta, *latest = NULL; + u32 i, seq_latest = 0; + void *meta_addr; + + meta = meta_ret; + + for (i = 0; i < PCACHE_META_INDEX_MAX; i++) { + meta_addr = (void *)header + (i * meta_max_size); + if (copy_mc_to_kernel(meta, meta_addr, meta_size)) { + pcache_err("hardware memory error when copy meta"); + return ERR_PTR(-EIO); + } + + /* Skip if CRC check fails, which means corrupted */ + if (meta->crc != pcache_meta_crc(meta, meta_size)) + continue; + + /* Update latest if a more recent sequence is found */ + if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) { + seq_latest = meta->seq; + latest = meta_addr; + } + } + + if (!latest) + return NULL; + + if (copy_mc_to_kernel(meta_ret, latest, meta_size)) { + pcache_err("hardware memory error"); + return ERR_PTR(-EIO); + } + + return latest; +} + +#endif /* _PCACHE_INTERNAL_H */ diff --git a/drivers/md/dm-pcache/segment.c b/drivers/md/dm-pcache/segment.c new file mode 100644 index 000000000000..7e9818701445 --- /dev/null +++ b/drivers/md/dm-pcache/segment.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/dax.h> + +#include "pcache_internal.h" +#include "cache_dev.h" +#include "segment.h" + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *src; + + iov_iter_bvec(&iter, ITER_DEST, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + src = segment->data + data_off; + copied = _copy_mc_to_iter(src, data_len, &iter); + if (copied != data_len) + return -EIO; + + return 0; +} + +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *dst; + + iov_iter_bvec(&iter, ITER_SOURCE, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + dst = segment->data + data_off; + copied = _copy_from_iter_flushcache(dst, data_len, &iter); + if (copied != data_len) + return -EIO; + pmem_wmb(); + + return 0; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options) +{ + segment->seg_info = options->seg_info; + segment_info_set_type(segment->seg_info, options->type); + + segment->cache_dev = cache_dev; + segment->seg_id = options->seg_id; + segment->data_size = PCACHE_SEG_SIZE - options->data_off; + segment->data = CACHE_DEV_SEGMENT(cache_dev, options->seg_id) + options->data_off; +} diff --git a/drivers/md/dm-pcache/segment.h b/drivers/md/dm-pcache/segment.h new file mode 100644 index 000000000000..deca1ddcb02b --- /dev/null +++ b/drivers/md/dm-pcache/segment.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_SEGMENT_H +#define _PCACHE_SEGMENT_H + +#include <linux/bio.h> +#include <linux/bitfield.h> + +#include "pcache_internal.h" + +struct pcache_segment_info { + struct pcache_meta_header header; + __u32 flags; + __u32 next_seg; +}; + +#define PCACHE_SEG_INFO_FLAGS_HAS_NEXT BIT(0) + +#define PCACHE_SEG_INFO_FLAGS_TYPE_MASK GENMASK(4, 1) +#define 
PCACHE_SEGMENT_TYPE_CACHE_DATA 1 + +static inline bool segment_info_has_next(struct pcache_segment_info *seg_info) +{ + return (seg_info->flags & PCACHE_SEG_INFO_FLAGS_HAS_NEXT); +} + +static inline void segment_info_set_type(struct pcache_segment_info *seg_info, u8 type) +{ + seg_info->flags &= ~PCACHE_SEG_INFO_FLAGS_TYPE_MASK; + seg_info->flags |= FIELD_PREP(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, type); +} + +static inline u8 segment_info_get_type(struct pcache_segment_info *seg_info) +{ + return FIELD_GET(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, seg_info->flags); +} + +struct pcache_segment_pos { + struct pcache_segment *segment; /* Segment associated with the position */ + u32 off; /* Offset within the segment */ +}; + +struct pcache_segment_init_options { + u8 type; + u32 seg_id; + u32 data_off; + + struct pcache_segment_info *seg_info; +}; + +struct pcache_segment { + struct pcache_cache_dev *cache_dev; + + void *data; + u32 data_size; + u32 seg_id; + + struct pcache_segment_info *seg_info; +}; + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); + +static inline void segment_pos_advance(struct pcache_segment_pos *seg_pos, u32 len) +{ + BUG_ON(seg_pos->off + len > seg_pos->segment->data_size); + + seg_pos->off += len; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options); +#endif /* _PCACHE_SEGMENT_H */ diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c index b49e10d76d03..f07e773d9cc0 100644 --- a/drivers/md/dm-ps-historical-service-time.c +++ b/drivers/md/dm-ps-historical-service-time.c @@ -541,8 +541,10 @@ static int __init dm_hst_init(void) { int r = dm_register_path_selector(&hst_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " HST_VERSION " loaded"); @@ -551,10 +553,7 @@ static int __init dm_hst_init(void) static void __exit dm_hst_exit(void) { - int r = dm_unregister_path_selector(&hst_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&hst_ps); } module_init(dm_hst_init); diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c index 461ee6b2044d..80415a045c68 100644 --- a/drivers/md/dm-ps-io-affinity.c +++ b/drivers/md/dm-ps-io-affinity.c @@ -116,7 +116,7 @@ static int ioa_create(struct path_selector *ps, unsigned int argc, char **argv) if (!s) return -ENOMEM; - s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *), + s->path_map = kcalloc(nr_cpu_ids, sizeof(struct path_info *), GFP_KERNEL); if (!s->path_map) goto free_selector; @@ -260,10 +260,7 @@ static int __init dm_ioa_init(void) static void __exit dm_ioa_exit(void) { - int ret = dm_unregister_path_selector(&ioa_ps); - - if (ret < 0) - DMERR("unregister failed %d", ret); + dm_unregister_path_selector(&ioa_ps); } module_init(dm_ioa_init); diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c index e305f05ad1e5..9c68701ed7a4 100644 --- a/drivers/md/dm-ps-queue-length.c +++ b/drivers/md/dm-ps-queue-length.c @@ -260,8 +260,10 @@ static int __init dm_ql_init(void) { int r = dm_register_path_selector(&ql_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " QL_VERSION " loaded"); @@ -270,10 +272,7 @@ static int __init dm_ql_init(void) 
static void __exit dm_ql_exit(void) { - int r = dm_unregister_path_selector(&ql_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&ql_ps); } module_init(dm_ql_init); diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c index d1745b123dc1..0c12f4073461 100644 --- a/drivers/md/dm-ps-round-robin.c +++ b/drivers/md/dm-ps-round-robin.c @@ -220,8 +220,10 @@ static int __init dm_rr_init(void) { int r = dm_register_path_selector(&rr_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " RR_VERSION " loaded"); @@ -230,10 +232,7 @@ static int __init dm_rr_init(void) static void __exit dm_rr_exit(void) { - int r = dm_unregister_path_selector(&rr_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&rr_ps); } module_init(dm_rr_init); diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c index 969d31c40272..0543fe7969c4 100644 --- a/drivers/md/dm-ps-service-time.c +++ b/drivers/md/dm-ps-service-time.c @@ -341,8 +341,10 @@ static int __init dm_st_init(void) { int r = dm_register_path_selector(&st_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " ST_VERSION " loaded"); @@ -351,10 +353,7 @@ static int __init dm_st_init(void) static void __exit dm_st_exit(void) { - int r = dm_unregister_path_selector(&st_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&st_ps); } module_init(dm_st_init); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1e0d3b9b75d6..c6f7129e43d3 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -438,7 +438,7 @@ static bool rs_is_reshapable(struct raid_set *rs) /* Return true, if raid set in @rs is recovering */ static bool rs_is_recovering(struct raid_set *rs) { - return rs->md.recovery_cp < rs->md.dev_sectors; + return rs->md.resync_offset < rs->md.dev_sectors; } /* Return true, if raid set in @rs is reshaping */ @@ -768,7 +768,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r rs->md.layout = raid_type->algorithm; rs->md.new_layout = rs->md.layout; rs->md.delta_disks = 0; - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); @@ -912,7 +912,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rs->md.external = 0; rs->md.persistent = 1; rs->md.major_version = 2; - } else if (rebuild && !rs->md.recovery_cp) { + } else if (rebuild && !rs->md.resync_offset) { /* * Without metadata, we will not be able to tell if the array * is in-sync or not - we must assume it is not. 
Therefore, @@ -1355,11 +1355,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, return -EINVAL; } - /* - * In device-mapper, we specify things in sectors, but - * MD records this value in kB - */ - if (value < 0 || value / 2 > COUNTER_MAX) { + if (value < 0) { rs->ti->error = "Max write-behind limit out of range"; return -EINVAL; } @@ -1699,20 +1695,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; /* * A raid6 set has to be recovered either * completely or for the grown part to * ensure proper parity and Q-Syndrome */ else if (rs_is_raid6(rs)) - rs->md.recovery_cp = dev_sectors; + rs->md.resync_offset = dev_sectors; /* * Other raid set types may skip recovery * depending on the 'nosync' flag. */ else - rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) + rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) ? MaxSector : dev_sectors; } @@ -2147,7 +2143,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); - sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->array_resync_offset = cpu_to_le64(mddev->resync_offset); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); @@ -2338,18 +2334,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) } if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) - mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + mddev->resync_offset = le64_to_cpu(sb->array_resync_offset); /* * During load, we set FirstUse if a new superblock was written. * There are two reasons we might not have a superblock: * 1) The raid set is brand new - in which case, all of the * devices must have their In_sync bit set. Also, - * recovery_cp must be 0, unless forced. + * resync_offset must be 0, unless forced. * 2) This is a new device being added to an old raid set * and the new device needs to be rebuilt - in which * case the In_sync bit will /not/ be set and - * recovery_cp must be MaxSector. + * resync_offset must be MaxSector. * 3) This is/are a new device(s) being added to an old * raid set during takeover to a higher raid level * to provide capacity for redundancy or during reshape @@ -2394,8 +2390,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) new_devs > 1 ? 
"s" : ""); return -EINVAL; } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { - DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", - (unsigned long long) mddev->recovery_cp); + DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)", + (unsigned long long) mddev->resync_offset); return -EINVAL; } else if (rs_is_reshaping(rs)) { DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", @@ -2410,7 +2406,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ sb_retrieve_failed_devices(sb, failed_devices); rdev_for_each(r, mddev) { - if (test_bit(Journal, &rdev->flags) || + if (test_bit(Journal, &r->flags) || !r->sb_page) continue; sb2 = page_address(r->sb_page); @@ -2535,6 +2531,10 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; + /* Respect resynchronization requested with "sync" argument. */ + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + freshest = NULL; rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) @@ -2700,11 +2700,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs) } out: /* - * Raise recovery_cp in case data_offset != 0 to + * Raise resync_offset in case data_offset != 0 to * avoid false recovery positives in the constructor. */ - if (rs->md.recovery_cp < rs->md.dev_sectors) - rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + if (rs->md.resync_offset < rs->md.dev_sectors) + rs->md.resync_offset += rs->dev[0].rdev.data_offset; /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { @@ -2759,7 +2759,7 @@ static int rs_setup_takeover(struct raid_set *rs) } clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; while (d--) { rdev = &rs->dev[d].rdev; @@ -2767,7 +2767,7 @@ static int rs_setup_takeover(struct raid_set *rs) if (test_bit(d, (void *) rs->rebuild_disks)) { clear_bit(In_sync, &rdev->flags); clear_bit(Faulty, &rdev->flags); - mddev->recovery_cp = rdev->recovery_offset = 0; + mddev->resync_offset = rdev->recovery_offset = 0; /* Bitmap has to be created when we do an "up" takeover */ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); } @@ -3196,7 +3196,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (reshape_sectors || rs_is_raid1(rs)) { /* * We can only prepare for a reshape here, because the - * raid set needs to run to provide the repective reshape + * raid set needs to run to provide the respective reshape * check functions via its MD personality instance. * * So do the reshape check after md_run() succeeded. @@ -3225,7 +3225,7 @@ size_check: if (r) goto bad; - rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? 
rs->md.resync_offset : rs->md.dev_sectors); } else { /* This is no size change or it is shrinking, update size and record in superblocks */ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); @@ -3247,7 +3247,7 @@ size_check: rs_reset_inconclusive_reshape(rs); /* Start raid set read-only and assumed clean to change in raid_resume() */ - rs->md.ro = 1; + rs->md.ro = MD_RDONLY; rs->md.in_sync = 1; /* Has to be held on running the array */ @@ -3308,6 +3308,7 @@ size_check: /* Disable/enable discard support on raid set. */ configure_discard_support(rs); + rs->md.dm_gendisk = dm_disk(dm_table_get_md(ti->table)); mddev_unlock(&rs->md); return 0; @@ -3327,6 +3328,7 @@ static void raid_dtr(struct dm_target *ti) mddev_lock_nointr(&rs->md); md_stop(&rs->md); + rs->md.dm_gendisk = NULL; mddev_unlock(&rs->md); if (work_pending(&rs->md.event_work)) @@ -3383,7 +3385,7 @@ static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long r /* The MD sync thread can be done with io or be interrupted but still be running */ if (!test_bit(MD_RECOVERY_DONE, &recovery) && (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { + (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return st_reshape; @@ -3447,7 +3449,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, } else { if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) - r = mddev->recovery_cp; + r = mddev->resync_offset; else r = mddev->curr_resync_completed; @@ -3773,11 +3775,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, } else return -EINVAL; } - if (mddev->ro == 2) { + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - mddev->ro = 0; + mddev->ro = MD_RDWR; if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } @@ -3811,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - limits->io_min = chunk_size_bytes; - limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + if (chunk_size_bytes) { + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + } } static void raid_presuspend(struct dm_target *ti) @@ -3856,6 +3860,7 @@ static void raid_postsuspend(struct dm_target *ti) */ md_stop_writes(&rs->md); mddev_suspend(&rs->md, false); + rs->md.ro = MD_RDONLY; } } @@ -3951,9 +3956,11 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { struct mddev *mddev = &rs->md; - r = mddev->bitmap_ops->load(mddev); - if (r) - DMERR("Failed to load bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->load(mddev); + if (r) + DMERR("Failed to load bitmap"); + } } return r; @@ -3966,7 +3973,7 @@ static void rs_update_sbs(struct raid_set *rs) int ro = mddev->ro; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; md_update_sb(mddev, 1); mddev->ro = ro; } @@ -4068,16 +4075,18 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, - chunksize, false); - if (r) - 
DMERR("Failed to resize bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize); + if (r) + DMERR("Failed to resize bitmap"); + } } /* Check for any resize/reshape on @rs and adjust/initiate */ - if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset && mddev->resync_offset < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - mddev->resync_min = mddev->recovery_cp; + mddev->resync_min = mddev->resync_offset; if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) mddev->resync_max_sectors = mddev->dev_sectors; } @@ -4123,7 +4132,7 @@ static void raid_resume(struct dm_target *ti) WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread, lockdep_is_held(&mddev->reconfig_mutex))); clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; mddev->in_sync = 0; md_unfrozen_sync_thread(mddev); mddev_unlock_and_resume(mddev); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9511dae5b556..268f734ca9c3 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -103,7 +103,7 @@ static void wakeup_mirrord(void *context) static void delayed_wake_fn(struct timer_list *t) { - struct mirror_set *ms = from_timer(ms, t, timer); + struct mirror_set *ms = timer_container_of(ms, t, timer); clear_bit(0, &ms->timer_pending); wakeup_mirrord(ms); @@ -133,10 +133,9 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void dispatch_bios(void *context, struct bio_list *bio_list) @@ -646,9 +645,9 @@ static void write_callback(unsigned long error, void *context) if (!ms->failures.head) should_wake = 1; bio_list_add(&ms->failures, bio); - spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void do_write(struct mirror_set *ms, struct bio *bio) @@ -656,7 +655,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) unsigned int i; struct dm_io_region io[MAX_NR_MIRRORS], *dest = io; struct mirror *m; - blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH); + blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH | REQ_ATOMIC); struct dm_io_request io_req = { .bi_opf = REQ_OP_WRITE | op_flags, .mem.type = DM_IO_BIO, @@ -1182,7 +1181,7 @@ static void mirror_dtr(struct dm_target *ti) { struct mirror_set *ms = ti->private; - del_timer_sync(&ms->timer); + timer_delete_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); flush_work(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); @@ -1483,8 +1482,9 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .module = THIS_MODULE, + .features = DM_TARGET_ATOMIC_WRITES, .ctr = mirror_ctr, .dtr = mirror_dtr, .map = mirror_map, diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index a4550975c27d..e9b47b659976 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -206,7 +206,7 @@ struct dm_region_hash *dm_region_hash_create( rh->shift = RH_HASH_SHIFT; rh->prime = RH_HASH_MULT; - rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets))); + rh->buckets = vmalloc_array(nr_buckets, sizeof(*rh->buckets)); if 
(!rh->buckets) { DMERR("unable to allocate region hash bucket memory"); kfree(rh); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 499f8cc8a39f..a6ca92049c10 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -217,10 +217,10 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped) if (unlikely(error == BLK_STS_TARGET)) { if (req_op(clone) == REQ_OP_DISCARD && !clone->q->limits.max_discard_sectors) - disable_discard(tio->md); + blk_queue_disable_discard(tio->md->queue); else if (req_op(clone) == REQ_OP_WRITE_ZEROES && !clone->q->limits.max_write_zeroes_sectors) - disable_write_zeroes(tio->md); + blk_queue_disable_write_zeroes(tio->md->queue); } switch (r) { @@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) md->tag_set->ops = &dm_mq_ops; md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->numa_node = md->numa_node_id; - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + md->tag_set->flags = BLK_MQ_F_STACKING; md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->driver_data = md; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 4112071de0be..1461dc740dae 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -316,7 +316,7 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); @@ -405,7 +405,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) { unsigned int i; - char major_minor[16]; + char major_minor[22]; struct stripe_c *sc = ti->private; if (!*error) @@ -417,8 +417,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, if (*error == BLK_STS_NOTSUPP) return DM_ENDIO_DONE; - memset(major_minor, 0, sizeof(major_minor)); - sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); + format_dev_t(major_minor, bio_dev(bio)); /* * Test to see which stripe drive triggered the event @@ -457,16 +456,22 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + unsigned int io_min, io_opt; - limits->io_min = chunk_size; - limits->io_opt = chunk_size * sc->stripes; + limits->chunk_sectors = sc->chunk_size; + + if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) && + !check_mul_overflow(io_min, sc->stripes, &io_opt)) { + limits->io_min = io_min; + limits->io_opt = io_opt; + } } static struct target_type stripe_target = { .name = "striped", - .version = {1, 6, 0}, - .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT, + .version = {1, 7, 0}, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | + DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index dfd9fb52a6f3..50a52ca50b34 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -114,8 +114,8 @@ static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths) return -EINVAL; } - sctx->region_table = vmalloc(array_size(nr_slots, - sizeof(region_table_slot_t))); + sctx->region_table = vmalloc_array(nr_slots, + sizeof(region_table_slot_t)); if (!sctx->region_table) { ti->error = 
"Cannot allocate region table"; return -ENOMEM; @@ -517,7 +517,9 @@ static void switch_status(struct dm_target *ti, status_type_t type, * * Passthrough all ioctls to the path for sector 0 */ -static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct switch_ctx *sctx = ti->private; unsigned int path_nr; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bd8b796ae683..ad0a60a07b93 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -117,7 +117,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num) n_targets = (struct dm_target *) (n_highs + num); memset(n_highs, -1, sizeof(*n_highs) * num); - kvfree(t->highs); t->num_allocated = num; t->highs = n_highs; @@ -257,7 +256,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, if (bdev_is_zoned(bdev)) { unsigned int zone_sectors = bdev_zone_sectors(bdev); - if (start & (zone_sectors - 1)) { + if (!bdev_is_zone_aligned(bdev, start)) { DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg", dm_device_name(ti->table->md), (unsigned long long)start, @@ -274,7 +273,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * devices do not end up with a smaller zone in the middle of * the sector range. */ - if (len & (zone_sectors - 1)) { + if (!bdev_is_zone_aligned(bdev, len)) { DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg", dm_device_name(ti->table->md), (unsigned long long)len, @@ -431,6 +430,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, return 0; } + mutex_lock(&q->limits_lock); + /* + * BLK_FEAT_ATOMIC_WRITES is not inherited from the bottom device in + * blk_stack_limits(), so do it manually. + */ + limits->features |= (q->limits.features & BLK_FEAT_ATOMIC_WRITES); + if (blk_stack_limits(limits, &q->limits, get_start_sect(bdev) + start) < 0) DMWARN("%s: adding target device %pg caused an alignment inconsistency: " @@ -448,6 +454,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ if (!dm_target_has_integrity(ti->type)) queue_limits_stack_integrity_bdev(limits, bdev); + mutex_unlock(&q->limits_lock); return 0; } @@ -523,8 +530,9 @@ static char **realloc_argv(unsigned int *size, char **old_argv) gfp = GFP_NOIO; } argv = kmalloc_array(new_size, sizeof(*argv), gfp); - if (argv && old_argv) { - memcpy(argv, old_argv, *size * sizeof(*argv)); + if (argv) { + if (old_argv) + memcpy(argv, old_argv, *size * sizeof(*argv)); *size = new_size; } @@ -697,6 +705,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } + if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) { + DMERR("%s: too large device", dm_device_name(t->md)); + return -EINVAL; + } ti->type = dm_get_target_type(type); if (!ti->type) { @@ -887,17 +899,17 @@ static bool dm_table_supports_dax(struct dm_table *t, return true; } -static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_not_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct block_device *bdev = dev->bdev; struct request_queue *q = bdev_get_queue(bdev); /* request-based cannot stack on partitions! 
*/ if (bdev_is_partition(bdev)) - return false; + return true; - return queue_is_mq(q); + return !queue_is_mq(q); } static int dm_table_determine_type(struct dm_table *t) @@ -993,7 +1005,7 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { + ti->type->iterate_devices(ti, device_is_not_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } @@ -1045,7 +1057,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; unsigned int bioset_flags = 0; - bool mempool_needs_integrity = t->integrity_supported; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1070,8 +1081,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); - - mempool_needs_integrity |= ti->mempool_needs_integrity; } pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, @@ -1081,15 +1090,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->io_bs, pool_size)) - goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->bs, pool_size)) - goto out_free_pools; t->mempools = pools; return 0; @@ -1177,7 +1180,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, t = dm_get_live_table(md, &srcu_idx); if (!t) - return 0; + goto put_live_table; for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1188,10 +1191,181 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, (void *)key); } +put_live_table: dm_put_live_table(md, srcu_idx); return 0; } +enum dm_wrappedkey_op { + DERIVE_SW_SECRET, + IMPORT_KEY, + GENERATE_KEY, + PREPARE_KEY, +}; + +struct dm_wrappedkey_op_args { + enum dm_wrappedkey_op op; + int err; + union { + struct { + const u8 *eph_key; + size_t eph_key_size; + u8 *sw_secret; + } derive_sw_secret; + struct { + const u8 *raw_key; + size_t raw_key_size; + u8 *lt_key; + } import_key; + struct { + u8 *lt_key; + } generate_key; + struct { + const u8 *lt_key; + size_t lt_key_size; + u8 *eph_key; + } prepare_key; + }; +}; + +static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_wrappedkey_op_args *args = data; + struct block_device *bdev = dev->bdev; + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err = -EOPNOTSUPP; + + if (!args->err) + return 0; + + switch (args->op) { + case DERIVE_SW_SECRET: + err = blk_crypto_derive_sw_secret( + bdev, + args->derive_sw_secret.eph_key, + args->derive_sw_secret.eph_key_size, + args->derive_sw_secret.sw_secret); + break; + case IMPORT_KEY: + err = blk_crypto_import_key(profile, + args->import_key.raw_key, + args->import_key.raw_key_size, + args->import_key.lt_key); + break; + case GENERATE_KEY: + err = 
blk_crypto_generate_key(profile, + args->generate_key.lt_key); + break; + case PREPARE_KEY: + err = blk_crypto_prepare_key(profile, + args->prepare_key.lt_key, + args->prepare_key.lt_key_size, + args->prepare_key.eph_key); + break; + } + args->err = err; + + /* Try another device in case this fails. */ + return 0; +} + +static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile, + struct dm_wrappedkey_op_args *args) +{ + struct mapped_device *md = + container_of(profile, struct dm_crypto_profile, profile)->md; + struct dm_target *ti; + struct dm_table *t; + int srcu_idx; + int i; + + args->err = -EOPNOTSUPP; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + goto out; + + /* + * blk-crypto currently has no support for multiple incompatible + * implementations of wrapped inline crypto keys on a single system. + * It was already checked earlier that support for wrapped keys was + * declared on all underlying devices. Thus, all the underlying devices + * should support all wrapped key operations and they should behave + * identically, i.e. work with the same keys. So, just executing the + * operation on the first device on which it works suffices for now. + */ + for (i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args); + if (!args->err) + break; + } +out: + dm_put_live_table(md, srcu_idx); + return args->err; +} + +static int dm_derive_sw_secret(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = DERIVE_SW_SECRET, + .derive_sw_secret = { + .eph_key = eph_key, + .eph_key_size = eph_key_size, + .sw_secret = sw_secret, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = IMPORT_KEY, + .import_key = { + .raw_key = raw_key, + .raw_key_size = raw_key_size, + .lt_key = lt_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = GENERATE_KEY, + .generate_key = { + .lt_key = lt_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = PREPARE_KEY, + .prepare_key = { + .lt_key = lt_key, + .lt_key_size = lt_key_size, + .eph_key = eph_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + static int device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -1250,6 +1424,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) profile->max_dun_bytes_supported = UINT_MAX; memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported)); + profile->key_types_supported = ~0; for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1265,6 +1440,13 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) profile); } + if (profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) { + profile->ll_ops.derive_sw_secret 
= dm_derive_sw_secret; + profile->ll_ops.import_key = dm_import_key; + profile->ll_ops.generate_key = dm_generate_key; + profile->ll_ops.prepare_key = dm_prepare_key; + } + if (t->md->queue && !blk_crypto_has_capabilities(profile, t->md->queue->crypto_profile)) { @@ -1492,6 +1674,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t) return true; } +bool dm_table_is_wildcard(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_is_wildcard(ti->type)) + return false; + } + + return true; +} + static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1723,8 +1917,12 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); + int b; - return !q->limits.max_write_zeroes_sectors; + mutex_lock(&q->limits_lock); + b = !q->limits.max_write_zeroes_sectors; + mutex_unlock(&q->limits_lock); + return b; } static bool dm_table_supports_write_zeroes(struct dm_table *t) @@ -1806,10 +2004,50 @@ static bool dm_table_supports_secure_erase(struct dm_table *t) return true; } +static int device_not_atomic_write_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + return !bdev_can_atomic_write(dev->bdev); +} + +static bool dm_table_supports_atomic_writes(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_supports_atomic_writes(ti->type)) + return false; + + if (!ti->type->iterate_devices) + return false; + + if (ti->type->iterate_devices(ti, + device_not_atomic_write_capable, NULL)) { + return false; + } + } + return true; +} + +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size) +{ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && dm_has_zone_plugs(t->md) && + old_size != new_size) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change size", + dm_device_name(t->md)); + return false; + } + return true; +} + int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { int r; + struct queue_limits old_limits; if (!dm_table_supports_nowait(t)) limits->features &= ~BLK_FEAT_NOWAIT; @@ -1827,8 +2065,10 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, limits->discard_alignment = 0; } - if (!dm_table_supports_write_zeroes(t)) + if (!dm_table_supports_write_zeroes(t)) { limits->max_write_zeroes_sectors = 0; + limits->max_hw_wzeroes_unmap_sectors = 0; + } if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; @@ -1836,25 +2076,30 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (dm_table_supports_flush(t)) limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; - if (dm_table_supports_dax(t, device_not_dax_capable)) { + if (dm_table_supports_dax(t, device_not_dax_capable)) limits->features |= BLK_FEAT_DAX; - if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) - set_dax_synchronous(t->md->dax_dev); - } else + else limits->features &= ~BLK_FEAT_DAX; - if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) - dax_write_cache(t->md->dax_dev, true); - /* For a zoned table, setup the zone related queue attributes. 
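Like the other dm_table_supports_*() helpers, the new atomic-write check phrases its per-device callback in the negative: iterate_devices() returns nonzero as soon as one callback does, so a device_not_..._capable callback reads as "some underlying device lacks the feature" and the table-level helper bails out on the first such device. A stand-alone sketch of that convention, with simplified table/target/device types invented for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct dev { bool can_atomic_write; };

struct target {
    struct dev *devs;
    size_t ndevs;
};

/* Mirrors iterate_devices(): stops and reports nonzero as soon as one callback does. */
static int iterate_devices(struct target *t, int (*fn)(struct dev *))
{
    for (size_t i = 0; i < t->ndevs; i++)
        if (fn(&t->devs[i]))
            return 1;
    return 0;
}

/* Negative-sense callback, like device_not_atomic_write_capable(). */
static int dev_not_atomic_write_capable(struct dev *d)
{
    return !d->can_atomic_write;
}

static bool table_supports_atomic_writes(struct target *targets, size_t ntargets)
{
    for (size_t i = 0; i < ntargets; i++)
        if (iterate_devices(&targets[i], dev_not_atomic_write_capable))
            return false;   /* at least one device cannot do atomic writes */
    return true;
}

int main(void)
{
    struct dev good[2] = { { true }, { true } };
    struct dev mixed[2] = { { true }, { false } };
    struct target all_good[] = { { good, 2 } };
    struct target with_bad[] = { { good, 2 }, { mixed, 2 } };

    printf("%d %d\n", table_supports_atomic_writes(all_good, 1),
           table_supports_atomic_writes(with_bad, 2));   /* 1 0 */
    return 0;
}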
*/ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - (limits->features & BLK_FEAT_ZONED)) { - r = dm_set_zones_restrictions(t, q, limits); - if (r) - return r; + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + if (limits->features & BLK_FEAT_ZONED) { + r = dm_set_zones_restrictions(t, q, limits); + if (r) + return r; + } else if (dm_has_zone_plugs(t->md)) { + DMWARN("%s: device has zone write plug resources. " + "Cannot switch to non-zoned table.", + dm_device_name(t->md)); + return -EINVAL; + } } - r = queue_limits_set(q, limits); + if (dm_table_supports_atomic_writes(t)) + limits->features |= BLK_FEAT_ATOMIC_WRITES; + + old_limits = queue_limits_start_update(q); + r = queue_limits_commit_update(q, limits); if (r) return r; @@ -1865,10 +2110,21 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (limits->features & BLK_FEAT_ZONED)) { r = dm_revalidate_zones(t, q); - if (r) + if (r) { + queue_limits_set(q, &old_limits); return r; + } } + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + dm_finalize_zone_settings(t, limits); + + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) + set_dax_synchronous(t->md->dax_dev); + + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) + dax_write_cache(t->md->dax_dev, true); + dm_update_crypto_profile(q, t); return 0; } diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 652627aea11b..8fede41adec0 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -255,7 +255,7 @@ static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits) static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { return -EIO; } @@ -263,7 +263,8 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, static struct target_type error_target = { .name = "error", .version = {1, 7, 0}, - .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM, + .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM | + DM_TARGET_PASSES_INTEGRITY, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index bf0f9dddd146..c84149ba4e38 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2332,10 +2332,9 @@ static struct thin_c *get_first_thin(struct pool *pool) struct thin_c *tc = NULL; rcu_read_lock(); - if (!list_empty(&pool->active_thins)) { - tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list); + if (tc) thin_get(tc); - } rcu_read_unlock(); return tc; @@ -3032,8 +3031,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, } pool->cell_sort_array = - vmalloc(array_size(CELL_SORT_ARRAY_SIZE, - sizeof(*pool->cell_sort_array))); + vmalloc_array(CELL_SORT_ARRAY_SIZE, + sizeof(*pool->cell_sort_array)); if (!pool->cell_sort_array) { *error = "Error allocating cell sort array"; err_p = ERR_PTR(-ENOMEM); @@ -4112,8 +4111,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | - DM_TARGET_IMMUTABLE, - .version = {1, 23, 0}, + DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_CRYPTO, + .version = {1, 24, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4498,7 +4497,8 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 
static struct target_type thin_target = { .name = "thin", - .version = {1, 23, 0}, + .features = DM_TARGET_PASSES_CRYPTO, + .version = {1, 24, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 89cb7942ec5c..baf683cabb1b 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache, * select_lru_page() - Determine which page is least recently used. * * Picks the least recently used from among the non-busy entries at the front of each of the lru - * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely + * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely * that the entries at the front are busy unless the queue is very short, but not impossible. * * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be @@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context) static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) { - return_vio_to_pool(zone->vio_pool, vio); + return_vio_to_pool(vio); check_for_drain_complete(zone); } @@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion) if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) vdo_format_block_map_page(page, nonce, pbn, false); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); /* Release our claim to the load and wake any waiters */ release_page_lock(data_vio, "load"); @@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); struct data_vio *data_vio = completion->parent; - struct block_map_zone *zone = pooled->context; vio_record_metadata_io_error(vio); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); abort_load(data_vio, result); } @@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor) struct cursors *cursors = cursor->parent; struct vdo_completion *completion = cursors->completion; - return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio)); + return_vio_to_pool(vdo_forget(cursor->vio)); if (--cursors->active_roots > 0) return; @@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, if (result != VDO_SUCCESS) return result; - result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, + result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1, zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, zone, &zone->vio_pool); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index a8c4d6e24b38..2a8b03779f87 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,9 +44,6 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, - /* Unit test minimum */ - MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2, - /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. 
This value was chosen to make it highly unlikely diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 810002747091..262e11581f2d 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -17,6 +17,7 @@ #include <linux/minmax.h> #include <linux/sched.h> #include <linux/spinlock.h> +#include <linux/string.h> #include <linux/wait.h> #include "logger.h" @@ -509,18 +510,6 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY); } -static bool is_zero_block(char *block) -{ - int i; - - for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) { - if (*((u64 *) &block[i])) - return false; - } - - return true; -} - static void copy_from_bio(struct bio *bio, char *data_ptr) { struct bio_vec biovec; @@ -572,7 +561,7 @@ static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *b * we acknowledge the bio. */ copy_from_bio(bio, data_vio->vio.data); - data_vio->is_zero = is_zero_block(data_vio->vio.data); + data_vio->is_zero = mem_is_zero(data_vio->vio.data, VDO_BLOCK_SIZE); data_vio->write = true; } @@ -1459,7 +1448,7 @@ static void modify_for_partial_write(struct vdo_completion *completion) copy_from_bio(bio, data + data_vio->offset); } - data_vio->is_zero = is_zero_block(data); + data_vio->is_zero = mem_is_zero(data, VDO_BLOCK_SIZE); data_vio->read = false; launch_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot); diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index b6f8e2dc7729..4d983092a152 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -226,7 +226,7 @@ struct hash_lock { * A list containing the data VIOs sharing this lock, all having the same record name and * data block contents, linked by their hash_lock_node fields. */ - struct list_head duplicate_ring; + struct list_head duplicate_vios; /* The number of data_vios sharing this lock instance */ data_vio_count_t reference_count; @@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l { memset(lock, 0, sizeof(*lock)); INIT_LIST_HEAD(&lock->pool_node); - INIT_LIST_HEAD(&lock->duplicate_ring); + INIT_LIST_HEAD(&lock->duplicate_vios); vdo_waitq_init(&lock->waiters); list_add_tail(&lock->pool_node, &zone->lock_pool); } @@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, "must have a hash zone when holding a hash lock"); VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), - "must be on a hash lock ring when holding a hash lock"); + "must be on a hash lock list when holding a hash lock"); VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0, "hash lock reference must be counted"); @@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) if (new_lock != NULL) { /* - * Keep all data_vios sharing the lock on a ring since they can complete in any + * Keep all data_vios sharing the lock on a list since they can complete in any * order and we'll always need a pointer to one to compare data. 
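The deleted is_zero_block() scanned the 4 KiB buffer one u64 at a time; the patch replaces both call sites with the generic mem_is_zero() helper. Outside the kernel the same check is commonly written as a first-byte test plus a self-overlapping memcmp(); a minimal sketch, with the block size chosen only for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 4096   /* stands in for VDO_BLOCK_SIZE */

/* True when every byte of the buffer is zero. */
static bool buffer_is_zero(const unsigned char *buf, size_t len)
{
    return len == 0 || (buf[0] == 0 && !memcmp(buf, buf + 1, len - 1));
}

int main(void)
{
    unsigned char block[BLOCK_SIZE] = { 0 };

    printf("%d\n", buffer_is_zero(block, sizeof(block)));   /* 1 */
    block[BLOCK_SIZE - 1] = 0x5a;
    printf("%d\n", buffer_is_zero(block, sizeof(block)));   /* 0 */
    return 0;
}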
*/ - list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); + list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios); new_lock->reference_count += 1; if (new_lock->max_references < new_lock->reference_count) new_lock->max_references = new_lock->reference_count; @@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate struct hash_zone *zone; bool collides; - if (list_empty(&lock->duplicate_ring)) + if (list_empty(&lock->duplicate_vios)) return false; - lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, + lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio, hash_lock_entry); zone = candidate->hash_zone; collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); @@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio return result; result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry), - "must not already be a member of a hash lock ring"); + "must not already be a member of a hash lock list"); if (result != VDO_SUCCESS) return result; @@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio) "returned hash lock must not be in use with state %s", get_hash_lock_state_name(lock->state)); VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node), - "hash lock returned to zone must not be in a pool ring"); - VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), + "hash lock returned to zone must not be in a pool list"); + VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios), "hash lock returned to zone must not reference DataVIOs"); return_hash_lock_to_pool(zone, lock); @@ -2178,6 +2178,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones) vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval); vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval); + spin_lock_init(&zones->lock); /* * Since we will save up the timeouts that would have been reported but were ratelimited, @@ -2260,7 +2261,7 @@ static void check_for_drain_complete(struct hash_zone *zone) if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) || change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_IDLE)) { - del_timer_sync(&zone->timer); + timer_delete_sync(&zone->timer); } else { /* * There is an in flight time-out, which must get processed before we can continue. @@ -2336,7 +2337,7 @@ static void timeout_index_operations_callback(struct vdo_completion *completion) static void timeout_index_operations(struct timer_list *t) { - struct hash_zone *zone = from_timer(zone, t, timer); + struct hash_zone *zone = timer_container_of(zone, t, timer); if (change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_FIRED)) diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 100e92f8f866..b7cc0f41caca 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); meta_blocks = (ref_blocks + slab_journal_blocks); - /* Make sure test code hasn't configured slabs to be too small. */ + /* Make sure configured slabs are not too small. 
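The dedupe timer handling is also moved to the current timer API names: timer_delete_sync() replaces del_timer_sync(), and timer_container_of() replaces from_timer() for recovering the enclosing structure inside the callback. A minimal kernel-style sketch of that pattern; the zone_like structure and its fields are invented for illustration:

#include <linux/timer.h>
#include <linux/jiffies.h>

struct zone_like {
    struct timer_list timer;
    /* ... other per-zone state ... */
};

static void zone_timeout(struct timer_list *t)
{
    /* Recover the enclosing structure from the timer_list pointer. */
    struct zone_like *zone = timer_container_of(zone, t, timer);

    (void)zone;   /* placeholder for the real timeout handling */
}

static void zone_start(struct zone_like *zone)
{
    timer_setup(&zone->timer, zone_timeout, 0);
    mod_timer(&zone->timer, jiffies + HZ);
}

static void zone_stop(struct zone_like *zone)
{
    /* Waits for a running callback to finish before returning. */
    timer_delete_sync(&zone->timer);
}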
*/ if (meta_blocks >= slab_size) return VDO_BAD_CONFIGURATION; - /* - * If the slab size is very small, assume this must be a unit test and override the number - * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their - * data_blocks fields to be the exact capacity of the configured volume, and that used to - * fall out since they use a power of two for the number of data blocks, the slab size was - * a power of two, and every block in a slab was a data block. - * - * TODO: Try to figure out some way of structuring testParameters and unit tests so this - * hack isn't needed without having to edit several unit tests every time the metadata size - * changes by one block. - */ data_blocks = slab_size - meta_blocks; - if ((slab_size < 1024) && !is_power_of_2(data_blocks)) - data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); /* * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in @@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS, - "slab journal size meets minimum size"); - if (result != VDO_SUCCESS) - return result; - result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size, "slab journal size is within expected bound"); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index ae11941c90a9..0613c82bbe8e 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -252,8 +252,7 @@ static void service_work_queue(struct simple_work_queue *queue) * This speeds up some performance tests; that "other work" might include other VDO * threads. */ - if (need_resched()) - cond_resched(); + cond_resched(); } run_finish_hook(queue); diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index af8fab83b0f3..61edf2b72427 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -54,7 +54,6 @@ * Each save also has a unique nonce. */ -#define MAGIC_SIZE 32 #define NONCE_INFO_SIZE 32 #define MAX_SAVES 2 @@ -98,9 +97,11 @@ enum region_type { #define SUPER_VERSION_CURRENT 3 #define SUPER_VERSION_MAXIMUM 7 -static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ +#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1) + struct region_header { u64 magic; u64 region_blocks; diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aee0914d604a..aa575a24e0b2 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session) int uds_launch_request(struct uds_request *request) { - size_t internal_size; int result; if (request->callback == NULL) { @@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request) } /* Reset all internal fields before processing. 
*/ - internal_size = - sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); - // FIXME should be using struct_group for this instead - memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); + memset(&request->internal, 0, sizeof(request->internal)); result = get_index_session(request->session); if (result != UDS_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 183a94eb7e92..7c1fc4577f5b 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -8,6 +8,7 @@ #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/stddef.h> #include <linux/types.h> #include <linux/wait.h> @@ -73,7 +74,7 @@ enum uds_request_type { /* Remove any mapping for a name. */ UDS_DELETE, -}; +} __packed; enum uds_open_index_type { /* Create a new index. */ @@ -226,7 +227,7 @@ struct uds_zone_message { enum uds_zone_message_type type; /* The virtual chapter number to which the message applies */ u64 virtual_chapter; -}; +} __packed; struct uds_index_session; struct uds_index; @@ -253,34 +254,32 @@ struct uds_request { /* The existing data associated with the request name, if any */ struct uds_record_data old_metadata; - /* Either UDS_SUCCESS or an error code for the request */ - int status; /* True if the record name had an existing entry in the index */ bool found; + /* Either UDS_SUCCESS or an error code for the request */ + int status; - /* - * The remaining fields are used internally and should not be altered by clients. The index - * relies on zone_number being the first field in this section. - */ - - /* The number of the zone which will process this request*/ - unsigned int zone_number; - /* A link for adding a request to a lock-free queue */ - struct funnel_queue_entry queue_link; - /* A link for adding a request to a standard linked list */ - struct uds_request *next_request; - /* A pointer to the index processing this request */ - struct uds_index *index; - /* Control message for coordinating between zones */ - struct uds_zone_message zone_message; - /* If true, process request immediately by waking the worker thread */ - bool unbatched; - /* If true, continue this request before processing newer requests */ - bool requeued; - /* The virtual chapter containing the record name, if known */ - u64 virtual_chapter; - /* The region of the index containing the record name */ - enum uds_index_region location; + /* The remaining fields are used internally and should not be altered by clients. */ + struct_group(internal, + /* The virtual chapter containing the record name, if known */ + u64 virtual_chapter; + /* The region of the index containing the record name */ + enum uds_index_region location; + /* If true, process request immediately by waking the worker thread */ + bool unbatched; + /* If true, continue this request before processing newer requests */ + bool requeued; + /* Control message for coordinating between zones */ + struct uds_zone_message zone_message; + /* The number of the zone which will process this request*/ + unsigned int zone_number; + /* A link for adding a request to a lock-free queue */ + struct funnel_queue_entry queue_link; + /* A link for adding a request to a standard linked list */ + struct uds_request *next_request; + /* A pointer to the index processing this request */ + struct uds_index *index; + ); }; /* A session is required for most index operations. 
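Wrapping the internal fields in struct_group() lets uds_launch_request() clear them with a plain memset() over the named group instead of the old offsetof() arithmetic, while every member stays directly addressable. A stand-alone sketch of how such a group behaves; the macro below is a simplified stand-in for the kernel's struct_group() from <linux/stddef.h>, and the request type is invented:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the kernel's struct_group(). */
#define struct_group(NAME, MEMBERS...)      \
    union {                                 \
        struct { MEMBERS };                 \
        struct { MEMBERS } NAME;            \
    }

struct request_like {
    int status;                 /* caller-visible result */
    struct_group(internal,      /* reset before each use */
        uint64_t virtual_chapter;
        unsigned int zone_number;
        void *next_request;
    );
};

int main(void)
{
    struct request_like req = { .status = 7 };

    req.zone_number = 3;        /* members are still addressed directly... */
    memset(&req.internal, 0, sizeof(req.internal));  /* ...but clear as one block */
    printf("%d %u\n", req.status, req.zone_number);  /* 7 0 */
    return 0;
}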
*/ diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index 12f954a0c532..afb062e1f1fb 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -836,7 +836,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) { return vdo_log_warning_strerror(UDS_CORRUPT_DATA, @@ -928,7 +928,7 @@ static int start_restoring_volume_index(struct volume_index *volume_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0) return vdo_log_warning_strerror(UDS_CORRUPT_DATA, diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 655453bb276b..425b3a74f4db 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -754,10 +754,11 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * u32 physical_page, struct cached_page **page_ptr) { struct cached_page *page; + unsigned int zone_number = request->zone_number; get_page_from_cache(&volume->page_cache, physical_page, &page); if (page != NULL) { - if (request->zone_number == 0) { + if (zone_number == 0) { /* Only one zone is allowed to update the LRU. */ make_page_most_recent(&volume->page_cache, page); } @@ -767,7 +768,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * } /* Prepare to enqueue a read for the page. */ - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); mutex_lock(&volume->read_threads_mutex); /* @@ -787,8 +788,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * the order does not matter for correctness as it does below. */ mutex_unlock(&volume->read_threads_mutex); - begin_pending_search(&volume->page_cache, physical_page, - request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); return UDS_QUEUED; } @@ -797,7 +797,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * "search pending" state in careful order so no other thread can mess with the data before * the caller gets to look at it. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); mutex_unlock(&volume->read_threads_mutex); *page_ptr = page; return UDS_SUCCESS; @@ -849,6 +849,7 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r { int result; struct cached_page *page = NULL; + unsigned int zone_number = request->zone_number; u32 physical_page = map_to_physical_page(volume->geometry, chapter, index_page_number); @@ -858,18 +859,18 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r * invalidation by the reader thread, before the reader thread has noticed that the * invalidate_counter has been incremented. 
*/ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); result = get_volume_page_protected(volume, request, physical_page, &page); if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } result = uds_search_chapter_index_page(&page->index_page, volume->geometry, &request->record_name, record_page_number); - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } @@ -882,6 +883,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req { struct cached_page *record_page; struct index_geometry *geometry = volume->geometry; + unsigned int zone_number = request->zone_number; int result; u32 physical_page, page_number; @@ -905,11 +907,11 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req * invalidation by the reader thread, before the reader thread has noticed that the * invalidate_counter has been incremented. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); result = get_volume_page_protected(volume, request, physical_page, &record_page); if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } @@ -917,7 +919,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req &request->record_name, geometry, &request->old_metadata)) *found = true; - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return UDS_SUCCESS; } diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 421e5436c32c..11d47770b54d 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) * @error_handler: the handler for submission or I/O errors (may be NULL) * @operation: the type of I/O to perform * @data: the buffer to read or write (may be NULL) + * @size: the I/O amount in bytes * * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block * other vdo threads. 
@@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) */ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data) + blk_opf_t operation, char *data, int size) { int result; struct vdo_completion *completion = &vio->completion; @@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, vdo_reset_completion(completion); completion->error_handler = error_handler; - result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); + result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META, + physical); if (result != VDO_SUCCESS) { continue_vio(vio, result); return; diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 80748699496f..3088f11055fd 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -8,6 +8,7 @@ #include <linux/bio.h> +#include "constants.h" #include "types.h" struct io_submitter; @@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio); void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data); + blk_opf_t operation, char *data, int size); static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, blk_opf_t operation) { __submit_metadata_vio(vio, physical, callback, error_handler, - operation, vio->data); + operation, vio->data, vio->block_count * VDO_BLOCK_SIZE); +} + +static inline void vdo_submit_metadata_vio_with_size(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action_fn error_handler, + blk_opf_t operation, + int size) +{ + __submit_metadata_vio(vio, physical, callback, error_handler, + operation, vio->data, size); } static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, @@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, { /* FIXME: Can we just use REQ_OP_FLUSH? */ __submit_metadata_vio(vio, 0, callback, error_handler, - REQ_OP_WRITE | REQ_PREFLUSH, NULL); + REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 3f7dc2cb6b98..76a987ccf926 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -34,7 +34,7 @@ static const char *get_current_interrupt_type(void) if (in_nmi()) return "NMI"; - if (in_irq()) + if (in_hardirq()) return "HI"; if (in_softirq()) diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h index 0f3be44710b5..8c8d6892582d 100644 --- a/drivers/md/dm-vdo/packer.h +++ b/drivers/md/dm-vdo/packer.h @@ -46,7 +46,7 @@ struct compressed_block { /* * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed - * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with + * block. The bins are kept in a list sorted by the amount of unused space so the first bin with * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. 
* Upon entering the packer, each data_vio already has its compressed data in the first slot of the diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 42d3d8d0e4b5..9bae8256ba4e 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e /* * Remove the entry from the bucket list, remembering a pointer to another entry in the - * ring. + * list. */ next_entry = entry->next; list_del_init(entry); diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h index 899071173015..25e7ec6d19f6 100644 --- a/drivers/md/dm-vdo/recovery-journal.h +++ b/drivers/md/dm-vdo/recovery-journal.h @@ -43,9 +43,9 @@ * has a vio which is used to commit that block to disk. The vio's data is the on-disk * representation of the journal block. In addition each in-memory block has a buffer which is used * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are - * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active - * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is - * moved back to the 'free_tail_blocks' ring. + * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active + * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is + * moved back to the 'free_tail_blocks' list. * * When entries are added to the journal, they are added to the active in-memory block, as * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 8f0a35c63af6..f3d80ff7bef5 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab) } /** - * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct + * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct * order. * @journal: The journal to be marked dirty. * @lock: The recovery journal lock held by the slab journal. @@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion) { struct slab_journal *journal = completion->parent; - return_vio_to_pool(journal->slab->allocator->vio_pool, - vio_as_pooled_vio(as_vio(vdo_forget(completion)))); + return_vio_to_pool(vio_as_pooled_vio(as_vio(completion))); finish_reaping(journal); reap_slab_journal(journal); } @@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion) sequence_number_t committed = get_committing_sequence_number(pooled); list_del_init(&pooled->list_entry); - return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); + return_vio_to_pool(pooled); if (result != VDO_SUCCESS) { vio_record_metadata_io_error(as_vio(completion)); @@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal) /* * Since we are about to commit the tail block, this journal no longer needs to be on the - * ring of journals which the recovery journal might ask to commit. + * list of journals which the recovery journal might ask to commit. 
*/ mark_slab_journal_clean(journal); @@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion) /* Release the slab journal lock. */ adjust_slab_journal_block_reference(&slab->journal, block->slab_journal_lock_to_release, -1); - return_vio_to_pool(slab->allocator->vio_pool, pooled); + return_vio_to_pool(pooled); /* * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause @@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion) struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; vio_record_metadata_io_error(vio); - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); - slab->active_count--; + return_vio_to_pool(vio_as_pooled_vio(vio)); + slab->active_count -= vio->io_size / VDO_BLOCK_SIZE; vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); check_if_slab_drained(slab); } @@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab) static void prioritize_slab(struct vdo_slab *slab) { VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a slab must not already be on a ring when prioritizing"); + "a slab must not already be on a list when prioritizing"); slab->priority = calculate_slab_priority(slab); vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, slab->priority, &slab->allocq_entry); @@ -2165,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab) dirty_block(&slab->reference_blocks[i]); } +static inline bool journal_points_equal(struct journal_point first, + struct journal_point second) +{ + return ((first.sequence_number == second.sequence_number) && + (first.entry_count == second.entry_count)); +} + /** - * clear_provisional_references() - Clear the provisional reference counts from a reference block. - * @block: The block to clear. + * match_bytes() - Check an 8-byte word for bytes matching the value specified + * @input: A word to examine the bytes of + * @match: The byte value sought + * + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise */ -static void clear_provisional_references(struct reference_block *block) +static inline u64 match_bytes(u64 input, u8 match) { - vdo_refcount_t *counters = get_reference_counters_for_block(block); - block_count_t j; + u64 temp = input ^ (match * 0x0101010101010101ULL); + /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */ + u64 test_top_bits = ~temp & 0x8080808080808080ULL; + /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */ + u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL); + /* return 1 when both tests indicate temp byte is 0 */ + return (test_top_bits & test_low_bits) >> 7; +} + +/** + * count_valid_references() - Process a newly loaded refcount array + * @counters: the array of counters from a metadata block + * + * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't + * cleaned up at shutdown, changing them internally to "empty". + * + * Return: the number of blocks that are referenced (counters not "empty") + */ +static unsigned int count_valid_references(vdo_refcount_t *counters) +{ + u64 *words = (u64 *)counters; + /* It's easier to count occurrences of a specific byte than its absences. */ + unsigned int empty_count = 0; + /* For speed, we process 8 bytes at once. 
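match_bytes() is a classic SWAR (SIMD within a register) byte search: the XOR turns matching bytes into zero, and the two masked tests detect zero bytes in all eight lanes without per-byte branches. The later '% 255' fold in count_valid_references() works because each byte lane of split_count holds a small count and 256 is congruent to 1 mod 255, so the modulo sums the eight lanes as long as the total stays below 255 (hence the 254-byte cap per iteration). A stand-alone sketch that checks the trick against a naive byte loop, over a small illustrative buffer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 0x01 in every byte lane of 'input' that equals 'match', 0x00 elsewhere. */
static uint64_t match_bytes(uint64_t input, uint8_t match)
{
    uint64_t temp = input ^ (match * 0x0101010101010101ULL);
    uint64_t top = ~temp & 0x8080808080808080ULL;
    uint64_t low = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL);

    return (top & low) >> 7;
}

int main(void)
{
    uint8_t buf[32];
    uint64_t words[4];
    uint64_t lanes = 0;   /* eight per-byte counters, as in count_valid_references() */
    unsigned int naive = 0;

    for (unsigned int i = 0; i < sizeof(buf); i++)
        buf[i] = (i % 5 == 0) ? 0xff : (uint8_t)i;   /* sprinkle some 0xff bytes */
    memcpy(words, buf, sizeof(buf));

    for (unsigned int i = 0; i < 4; i++)
        lanes += match_bytes(words[i], 0xff);

    for (unsigned int i = 0; i < sizeof(buf); i++)
        naive += (buf[i] == 0xff);

    /* Folding with % 255 sums the eight lane counters: both values print as 7. */
    printf("%u %u\n", (unsigned int)(lanes % 255), naive);
    return 0;
}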
*/ + unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64); + + /* + * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter + * array is a multiple of the word size. + */ + BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1); + BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0); + + while (words_left > 0) { + /* + * This is used effectively as 8 byte-size counters. Byte 0 counts how many words + * had the target value found in byte 0, etc. We just have to avoid overflow. + */ + u64 split_count = 0; + /* + * The counter "% 255" trick used below to fold split_count into empty_count + * imposes a limit of 254 bytes examined each iteration of the outer loop. We + * process a word at a time, so that limit gets rounded down to 31 u64 words. + */ + const unsigned int max_words_per_iteration = 254 / sizeof(u64); + unsigned int iter_words_left = min_t(unsigned int, words_left, + max_words_per_iteration); + + words_left -= iter_words_left; + + while (iter_words_left--) { + u64 word = *words; + u64 temp; + + /* First, if we have any provisional refcount values, clear them. */ + temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT); + if (temp) { + /* + * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor + * will alter just those bytes, changing PROVISIONAL to EMPTY. + */ + word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT); + *words = word; + } - for (j = 0; j < COUNTS_PER_BLOCK; j++) { - if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { - counters[j] = EMPTY_REFERENCE_COUNT; - block->allocated_count--; + /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */ + split_count += match_bytes(word, EMPTY_REFERENCE_COUNT); + words++; } + empty_count += split_count % 255; } -} -static inline bool journal_points_equal(struct journal_point first, - struct journal_point second) -{ - return ((first.sequence_number == second.sequence_number) && - (first.entry_count == second.entry_count)); + return COUNTS_PER_BLOCK - empty_count; } /** @@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first, static void unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block) { - block_count_t index; sector_count_t i; struct vdo_slab *slab = block->slab; vdo_refcount_t *counters = get_reference_counters_for_block(block); @@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed, } } - block->allocated_count = 0; - for (index = 0; index < COUNTS_PER_BLOCK; index++) { - if (counters[index] != EMPTY_REFERENCE_COUNT) - block->allocated_count++; - } + block->allocated_count = count_valid_references(counters); } /** @@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion) struct pooled_vio *pooled = vio_as_pooled_vio(vio); struct reference_block *block = completion->parent; struct vdo_slab *slab = block->slab; + unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE; + unsigned int i; + char *data = vio->data; - unpack_reference_block((struct packed_reference_block *) vio->data, block); - return_vio_to_pool(slab->allocator->vio_pool, pooled); - slab->active_count--; - clear_provisional_references(block); + for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) { + struct packed_reference_block *packed = (struct packed_reference_block *) data; + + unpack_reference_block(packed, block); + slab->free_blocks -= block->allocated_count; + } + return_vio_to_pool(pooled); + slab->active_count -= 
block_count; - slab->free_blocks -= block->allocated_count; check_if_slab_drained(slab); } @@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio) } /** - * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the - * block. - * @waiter: The waiter of the block to load. + * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load + * a set of blocks. + * @waiter: The waiter of the first block to load. * @context: The VIO returned by the pool. */ -static void load_reference_block(struct vdo_waiter *waiter, void *context) +static void load_reference_block_group(struct vdo_waiter *waiter, void *context) { struct pooled_vio *pooled = context; struct vio *vio = &pooled->vio; struct reference_block *block = container_of(waiter, struct reference_block, waiter); - size_t block_offset = (block - block->slab->reference_blocks); + u32 block_offset = block - block->slab->reference_blocks; + u32 max_block_count = block->slab->reference_block_count - block_offset; + u32 block_count = min_t(int, vio->block_count, max_block_count); vio->completion.parent = block; - vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset, - load_reference_block_endio, handle_io_error, - REQ_OP_READ); + vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset, + load_reference_block_endio, handle_io_error, + REQ_OP_READ, block_count * VDO_BLOCK_SIZE); } /** @@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context) static void load_reference_blocks(struct vdo_slab *slab) { block_count_t i; + u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio; + struct vio_pool *pool = slab->allocator->refcount_big_vio_pool; + + if (!pool) { + pool = slab->allocator->vio_pool; + blocks_per_vio = 1; + } slab->free_blocks = slab->block_count; slab->active_count = slab->reference_block_count; - for (i = 0; i < slab->reference_block_count; i++) { + for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) { struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; - waiter->callback = load_reference_block; - acquire_vio_from_pool(slab->allocator->vio_pool, waiter); + waiter->callback = load_reference_block_group; + acquire_vio_from_pool(pool, waiter); } } @@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion) initialize_journal_state(journal); } - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); } @@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); vio_record_metadata_io_error(vio); - return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&journal->slab->state, result); } @@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab) int result; VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a requeued slab must not already be on a ring"); + "a requeued slab must not already be on a list"); if (vdo_is_read_only(allocator->depot->vdo)) return; @@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) vdo_log_info("VDO commencing normal operation"); else if (prior_state == VDO_RECOVERING) vdo_log_info("Exiting recovery mode"); + 
free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); } /* @@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator, * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as * the primary key and the 'emptiness' field as the secondary key. * - * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping + * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping * should always get the most empty first, so pushing should be from most empty to least empty. * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements * before larger ones. @@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, struct vdo *vdo = depot->vdo; block_count_t max_free_blocks = depot->slab_config.data_blocks; unsigned int max_priority = (2 + ilog2(max_free_blocks)); + u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio; *allocator = (struct block_allocator) { .depot = depot, @@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, return result; vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); - result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id, + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, allocator, &allocator->vio_pool); if (result != VDO_SUCCESS) return result; + /* Initialize the refcount-reading vio pool. */ + reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks); + refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO); + refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed); + allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio; + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE, + allocator->refcount_blocks_per_big_vio, allocator->thread_id, + VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + NULL, &allocator->refcount_big_vio_pool); + if (result != VDO_SUCCESS) + return result; + result = initialize_slab_scrubber(allocator); if (result != VDO_SUCCESS) return result; @@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot) uninitialize_allocator_summary(allocator); uninitialize_scrubber_vio(&allocator->scrubber); free_vio_pool(vdo_forget(allocator->vio_pool)); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); } diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index f234853501ca..fadc0c9d4dc4 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -45,6 +45,13 @@ enum { /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, + + /* + * The number of vios in the vio pool used for loading reference count data. A slab's + * refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be + * plenty. 
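The two DIV_ROUND_UP() steps when sizing the refcount pool first find the fewest reads that fit under the per-vio block limit, then spread the reference blocks evenly over that many reads. A small worked sketch; the 256-block limit is only an assumed placeholder for MAX_BLOCKS_PER_VIO, and the slab refcount size is illustrative:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned int reference_block_count = 2048;   /* illustrative slab refcount size */
    unsigned int max_blocks_per_vio = 256;       /* assumed stand-in for MAX_BLOCKS_PER_VIO */

    unsigned int reads_needed = DIV_ROUND_UP(reference_block_count, max_blocks_per_vio);
    unsigned int blocks_per_vio = DIV_ROUND_UP(reference_block_count, reads_needed);

    /* 2048 reference blocks -> 8 reads of 256 blocks each */
    printf("%u reads of up to %u blocks\n", reads_needed, blocks_per_vio);
    return 0;
}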
+ */ + BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9, }; /* @@ -248,7 +255,7 @@ struct vdo_slab { /* A list of the dirty blocks waiting to be written out */ struct vdo_wait_queue dirty_blocks; - /* The number of blocks which are currently writing */ + /* The number of blocks which are currently reading or writing */ size_t active_count; /* A waiter object for updating the slab summary */ @@ -425,6 +432,10 @@ struct block_allocator { /* The vio pool for reading and writing block allocator metadata */ struct vio_pool *vio_pool; + /* The vio pool for large initial reads of ref count areas */ + struct vio_pool *refcount_big_vio_pool; + /* How many ref count blocks are read per vio at initial load */ + u32 refcount_blocks_per_big_vio; /* The dm_kcopyd client for erasing slab journals */ struct dm_kcopyd_client *eraser; /* Iterator over the slabs to be erased */ diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index dbe892b10f26..cdf36e7d7702 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -376,6 +376,9 @@ struct vio { /* The size of this vio in blocks */ unsigned int block_count; + /* The amount of data to be read or written, in bytes */ + unsigned int io_size; + /* The data being read or written. */ char *data; diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index a7e32baab4af..80b608674022 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -31,9 +31,7 @@ #include <linux/completion.h> #include <linux/device-mapper.h> -#include <linux/kernel.h> #include <linux/lz4.h> -#include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/types.h> @@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr) vdo_unregister_allocating_thread(); } -#ifdef MODULE -#define MODULE_NAME THIS_MODULE->name -#else -#define MODULE_NAME "dm-vdo" -#endif /* MODULE */ - static const struct vdo_work_queue_type default_queue_type = { .start = start_vdo_request_queue, .finish = finish_vdo_request_queue, @@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, *vdo_ptr = vdo; snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), - "%s%u", MODULE_NAME, instance); - BUG_ON(vdo->thread_name_prefix[0] == '\0'); + "vdo%u", instance); result = vdo_allocate(vdo->thread_config.thread_count, struct vdo_thread, __func__, &vdo->threads); if (result != VDO_SUCCESS) { diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e710f3c5a972..8fc22fb14196 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb /* * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated - * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a - * vio associated with the bio. + * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not + * have to be a vio associated with the bio. 
*/ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn) { - int bvec_count, offset, len, i; + return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE, + callback, bi_opf, pbn); +} + +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn) +{ + int bvec_count, offset, i; struct bio *bio = vio->bio; + int vio_size = vio->block_count * VDO_BLOCK_SIZE; + int remaining; bio_reset(bio, bio->bi_bdev, bi_opf); vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); @@ -203,24 +212,23 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, return VDO_SUCCESS; bio->bi_ioprio = 0; - bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_io_vec = bio_inline_vecs(bio); bio->bi_max_vecs = vio->block_count + 1; - len = VDO_BLOCK_SIZE * vio->block_count; + if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", + size, vio_size) != VDO_SUCCESS) + size = vio_size; + vio->io_size = size; offset = offset_in_page(data); - bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); + bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE); + remaining = size; - /* - * If we knew that data was always on one page, or contiguous pages, we wouldn't need the - * loop. But if we're using vmalloc, it's not impossible that the data is in different - * pages that can't be merged in bio_add_page... - */ - for (i = 0; (i < bvec_count) && (len > 0); i++) { + for (i = 0; (i < bvec_count) && (remaining > 0); i++) { struct page *page; int bytes_added; int bytes = PAGE_SIZE - offset; - if (bytes > len) - bytes = len; + if (bytes > remaining) + bytes = remaining; page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); bytes_added = bio_add_page(bio, page, bytes, offset); @@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, } data += bytes; - len -= bytes; + remaining -= bytes; offset = 0; } @@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio) * make_vio_pool() - Create a new vio pool. * @vdo: The vdo. * @pool_size: The number of vios in the pool. + * @block_count: The number of 4k blocks per vio. * @thread_id: The ID of the thread using this pool. * @vio_type: The type of vios in the pool. * @priority: The priority with which vios from the pool should be enqueued. @@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio) * * Return: A success or error code. 
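/*
 * Illustrative sketch only (not part of the patch): roughly how the new
 * block_count argument to make_vio_pool() pairs with the per-allocator
 * sizing computed in initialize_block_allocator() above. The numeric
 * values and the stand-in for MAX_BLOCKS_PER_VIO are assumed here purely
 * for the example.
 */
static int example_make_refcount_pool(struct vdo *vdo, thread_id_t thread_id,
				       struct vio_pool **pool_ptr)
{
	u32 reference_block_count = 2000;  /* assumed slab refcount size, in 4k blocks */
	u32 max_blocks_per_vio = 256;      /* assumed stand-in for MAX_BLOCKS_PER_VIO */
	u32 reads_needed = DIV_ROUND_UP(reference_block_count, max_blocks_per_vio); /* 8 */
	u32 blocks_per_vio = DIV_ROUND_UP(reference_block_count, reads_needed);     /* 250 */

	/* Nine pooled vios, each sized to carry blocks_per_vio 4k blocks. */
	return make_vio_pool(vdo, 9, blocks_per_vio, thread_id,
			     VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
			     NULL, pool_ptr);
}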
*/ -int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, +int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id, enum vio_type vio_type, enum vio_priority priority, void *context, struct vio_pool **pool_ptr) { struct vio_pool *pool; char *ptr; int result; + size_t per_vio_size = VDO_BLOCK_SIZE * block_count; result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, __func__, &pool); @@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char, + result = vdo_allocate(pool_size * per_vio_size, char, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } ptr = pool->buffer; - for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) { + for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) { struct pooled_vio *pooled = &pool->vios[pool->size]; - result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr, + result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr, &pooled->vio); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } pooled->context = context; + pooled->pool = pool; list_add_tail(&pooled->pool_entry, &pool->available); } @@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter) } /** - * return_vio_to_pool() - Return a vio to the pool - * @pool: The vio pool. + * return_vio_to_pool() - Return a vio to its pool * @vio: The pooled vio to return. */ -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) +void return_vio_to_pool(struct pooled_vio *vio) { + struct vio_pool *pool = vio->pool; + VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), "vio pool entry returned on same thread as it was acquired"); diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 3490e9f59b04..4bfcb21901f1 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -30,6 +30,8 @@ struct pooled_vio { void *context; /* The list entry used by the pool */ struct list_head pool_entry; + /* The pool this vio is allocated from */ + struct vio_pool *pool; }; /** @@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn); +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn); void update_vio_error_stats(struct vio *vio, const char *format, ...) 
__printf(2, 3); @@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio) struct vio_pool; -int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, - enum vio_type vio_type, enum vio_priority priority, - void *context, struct vio_pool **pool_ptr); +int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, + thread_id_t thread_id, enum vio_type vio_type, + enum vio_priority priority, void *context, + struct vio_pool **pool_ptr); void free_vio_pool(struct vio_pool *pool); bool __must_check is_vio_pool_busy(struct vio_pool *pool); void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio); +void return_vio_to_pool(struct pooled_vio *vio); #endif /* VIO_H */ diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c index 6e1e739277ef..f81ed0cee2bf 100644 --- a/drivers/md/dm-vdo/wait-queue.c +++ b/drivers/md/dm-vdo/wait-queue.c @@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w waitq->last_waiter->next_waiter = waiter; } - /* In both cases, the waiter we added to the ring becomes the last waiter. */ + /* In both cases, the waiter we added to the list becomes the last waiter. */ waitq->last_waiter = waiter; waitq->length += 1; } diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 62b1a44b8dd2..72047b47a7a0 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -40,35 +40,23 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) } /* - * Decode an RS block using Reed-Solomon. - */ -static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, - u8 *data, u8 *fec, int neras) -{ - int i; - uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; - - for (i = 0; i < v->fec->roots; i++) - par[i] = fec[i]; - - return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, - fio->erasures, 0, NULL); -} - -/* * Read error-correcting codes for the requested RS block. Returns a pointer * to the data block. Caller is responsible for releasing buf. */ static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned int *offset, struct dm_buffer **buf, - unsigned short ioprio) + unsigned int *offset, unsigned int par_buf_offset, + struct dm_buffer **buf, unsigned short ioprio) { u64 position, block, rem; u8 *res; + /* We have already part of parity bytes read, skip to the next block */ + if (par_buf_offset) + index++; + position = (index + rsb) * v->fec->roots; block = div64_u64_rem(position, v->fec->io_size, &rem); - *offset = (unsigned int)rem; + *offset = par_buf_offset ? 
0 : (unsigned int)rem; res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio); if (IS_ERR(res)) { @@ -128,11 +116,13 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, { int r, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, offset; + unsigned int n, i, j, offset, par_buf_offset = 0; + uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; u8 *par, *block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio->bi_ioprio); if (IS_ERR(par)) return PTR_ERR(par); @@ -142,7 +132,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, */ fec_for_each_buffer_rs_block(fio, n, i) { block = fec_buffer_rs_block(v, fio, n, i); - res = fec_decode_rs8(v, fio, block, &par[offset], neras); + for (j = 0; j < v->fec->roots - par_buf_offset; j++) + par_buf[par_buf_offset + j] = par[offset + j]; + /* Decode an RS block using Reed-Solomon */ + res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, + NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; goto error; @@ -155,12 +149,22 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (block_offset >= 1 << v->data_dev_block_bits) goto done; - /* read the next block when we run out of parity bytes */ - offset += v->fec->roots; + /* Read the next block when we run out of parity bytes */ + offset += (v->fec->roots - par_buf_offset); + /* Check if parity bytes are split between blocks */ + if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { + par_buf_offset = v->fec->io_size - offset; + for (j = 0; j < par_buf_offset; j++) + par_buf[j] = par[offset + j]; + offset += par_buf_offset; + } else + par_buf_offset = 0; + if (offset >= v->fec->io_size) { dm_bufio_release(buf); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio->bi_ioprio); if (IS_ERR(par)) return PTR_ERR(par); } @@ -187,7 +191,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, u8 *want_digest, u8 *data) { if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true))) + verity_io_real_digest(v, io)))) return 0; return memcmp(verity_io_real_digest(v, io), want_digest, @@ -250,7 +254,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, bufio = v->bufio; } - bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio_prio(bio)); + bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio); if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", v->data_dev->name, @@ -316,11 +320,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT); - if (unlikely(!fio->bufs[n])) { - DMERR("failed to allocate FEC buffer"); - return -ENOMEM; - } + fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOIO); } /* try to allocate the maximum number of buffers */ @@ -388,7 +388,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true); + verity_io_real_digest(v, io)); if 
(unlikely(r < 0)) return r; @@ -589,6 +589,10 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, (*argc)--; if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + if (v->fec->dev) { + ti->error = "FEC device already specified"; + return -EINVAL; + } r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev); if (r) { ti->error = "FEC device lookup failed"; @@ -724,10 +728,7 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } - if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1)) - f->io_size = 1 << v->data_dev_block_bits; - else - f->io_size = v->fec->roots << SECTOR_SHIFT; + f->io_size = 1 << v->data_dev_block_bits; f->bufio = dm_bufio_client_create(f->dev->bdev, f->io_size, diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 47d595f6a76e..66a00a8ccb39 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -19,7 +19,6 @@ #include "dm-audit.h" #include <linux/module.h> #include <linux/reboot.h> -#include <linux/scatterlist.h> #include <linux/string.h> #include <linux/jump_label.h> #include <linux/security.h> @@ -30,6 +29,7 @@ #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 +#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 @@ -49,10 +49,16 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); -static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); +static unsigned int dm_verity_use_bh_bytes[4] = { + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE + 0 // IOPRIO_CLASS_IDLE +}; -/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ -static DEFINE_STATIC_KEY_FALSE(ahash_enabled); +module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644); + +static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); struct dm_verity_prefetch_work { struct work_struct work; @@ -108,100 +114,21 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } -static int verity_ahash_update(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, - struct crypto_wait *wait) -{ - struct scatterlist sg; - - if (likely(!is_vmalloc_addr(data))) { - sg_init_one(&sg, data, len); - ahash_request_set_crypt(req, &sg, NULL, len); - return crypto_wait_req(crypto_ahash_update(req), wait); - } - - do { - int r; - size_t this_step = min_t(size_t, len, PAGE_SIZE - offset_in_page(data)); - - flush_kernel_vmap_range((void *)data, this_step); - sg_init_table(&sg, 1); - sg_set_page(&sg, vmalloc_to_page(data), this_step, offset_in_page(data)); - ahash_request_set_crypt(req, &sg, NULL, this_step); - r = crypto_wait_req(crypto_ahash_update(req), wait); - if (unlikely(r)) - return r; - data += this_step; - len -= this_step; - } while (len); - - return 0; -} - -/* - * Wrapper for crypto_ahash_init, which handles verity salting. - */ -static int verity_ahash_init(struct dm_verity *v, struct ahash_request *req, - struct crypto_wait *wait, bool may_sleep) -{ - int r; - - ahash_request_set_tfm(req, v->ahash_tfm); - ahash_request_set_callback(req, - may_sleep ? 
CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, - crypto_req_done, (void *)wait); - crypto_init_wait(wait); - - r = crypto_wait_req(crypto_ahash_init(req), wait); - - if (unlikely(r < 0)) { - if (r != -ENOMEM) - DMERR("crypto_ahash_init failed: %d", r); - return r; - } - - if (likely(v->salt_size && (v->version >= 1))) - r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); - - return r; -} - -static int verity_ahash_final(struct dm_verity *v, struct ahash_request *req, - u8 *digest, struct crypto_wait *wait) -{ - int r; - - if (unlikely(v->salt_size && (!v->version))) { - r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); - - if (r < 0) { - DMERR("%s failed updating salt: %d", __func__, r); - goto out; - } - } - - ahash_request_set_crypt(req, NULL, digest, 0); - r = crypto_wait_req(crypto_ahash_final(req), wait); -out: - return r; -} - int verity_hash(struct dm_verity *v, struct dm_verity_io *io, - const u8 *data, size_t len, u8 *digest, bool may_sleep) + const u8 *data, size_t len, u8 *digest) { + struct shash_desc *desc = &io->hash_desc; int r; - if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) { - struct ahash_request *req = verity_io_hash_req(v, io); - struct crypto_wait wait; - - r = verity_ahash_init(v, req, &wait, may_sleep) ?: - verity_ahash_update(v, req, data, len, &wait) ?: - verity_ahash_final(v, req, digest, &wait); + desc->tfm = v->shash_tfm; + if (unlikely(v->initial_hashstate == NULL)) { + /* Version 0: salt at end */ + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, data, len) ?: + crypto_shash_update(desc, v->salt, v->salt_size) ?: + crypto_shash_final(desc, digest); } else { - struct shash_desc *desc = verity_io_hash_req(v, io); - - desc->tfm = v->shash_tfm; + /* Version 1: salt at beginning */ r = crypto_shash_import(desc, v->initial_hashstate) ?: crypto_shash_finup(desc, data, len, digest); } @@ -311,7 +238,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { data = dm_bufio_get(v->bufio, hash_block, &buf); - if (data == NULL) { + if (IS_ERR_OR_NULL(data)) { /* * In tasklet and the hash was not in the bufio cache. 
* Return early and resume execution from a work-queue @@ -321,11 +248,27 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } else { data = dm_bufio_read_with_ioprio(v->bufio, hash_block, - &buf, bio_prio(bio)); + &buf, bio->bi_ioprio); } - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + if (skip_unverified) + return 1; + r = PTR_ERR(data); + data = dm_bufio_new(v->bufio, hash_block, &buf); + if (IS_ERR(data)) + return r; + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data) == 0) { + aux = dm_bufio_get_aux_data(buf); + aux->hash_verified = 1; + goto release_ok; + } else { + dm_bufio_release(buf); + dm_bufio_forget(v->bufio, hash_block); + return r; + } + } aux = dm_bufio_get_aux_data(buf); @@ -336,7 +279,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io), !io->in_bh); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) goto release_ret_r; @@ -366,6 +309,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } +release_ok: data += offset; memcpy(want_digest, data, v->digest_size); r = 0; @@ -438,7 +382,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, goto free_ret; r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true); + verity_io_real_digest(v, io)); if (unlikely(r)) goto free_ret; @@ -554,7 +498,7 @@ static int verity_verify_io(struct dm_verity_io *io) } r = verity_hash(v, io, data, block_size, - verity_io_real_digest(v, io), !io->in_bh); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) { kunmap_local(data); return r; @@ -652,9 +596,18 @@ static void verity_bh_work(struct work_struct *w) verity_finish_io(io, errno_to_blk_status(err)); } +static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) +{ + return ioprio <= IOPRIO_CLASS_IDLE && + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]) && + !need_resched(); +} + static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; + unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits; if (bio->bi_status && (!verity_fec_is_enabled(io->v) || @@ -664,9 +617,14 @@ static void verity_end_io(struct bio *bio) return; } - if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) { - INIT_WORK(&io->bh_work, verity_bh_work); - queue_work(system_bh_wq, &io->bh_work); + if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq && + verity_use_bh(bytes, ioprio)) { + if (in_hardirq() || irqs_disabled()) { + INIT_WORK(&io->bh_work, verity_bh_work); + queue_work(system_bh_wq, &io->bh_work); + } else { + verity_bh_work(&io->bh_work); + } } else { INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); @@ -789,13 +747,20 @@ static int verity_map(struct dm_target *ti, struct bio *bio) verity_fec_init_io(io); - verity_submit_prefetch(v, io, bio_prio(bio)); + verity_submit_prefetch(v, io, bio->bi_ioprio); submit_bio_noacct(bio); return DM_MAPIO_SUBMITTED; } +static void verity_postsuspend(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + flush_workqueue(v->verify_wq); + dm_bufio_client_reset(v->bufio); +} + /* * Status: V (valid) or C (corruption found) */ @@ -946,7 +911,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct 
dm_target *ti, struct block_device **bdev) +static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dm_verity *v = ti->private; @@ -1042,12 +1009,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->zero_digest); verity_free_sig(v); - if (v->ahash_tfm) { - static_branch_dec(&ahash_enabled); - crypto_free_ahash(v->ahash_tfm); - } else { - crypto_free_shash(v->shash_tfm); - } + crypto_free_shash(v->shash_tfm); kfree(v->alg_name); @@ -1073,6 +1035,9 @@ static int verity_alloc_most_once(struct dm_verity *v) { struct dm_target *ti = v->ti; + if (v->validated_blocks) + return 0; + /* the bitset can only handle INT_MAX blocks */ if (v->data_blocks > INT_MAX) { ti->error = "device too large to use check_at_most_once"; @@ -1096,12 +1061,16 @@ static int verity_alloc_zero_digest(struct dm_verity *v) struct dm_verity_io *io; u8 *zero_data; + if (v->zero_digest) + return 0; + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->zero_digest) return r; - io = kmalloc(sizeof(*io) + v->hash_reqsize, GFP_KERNEL); + io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm), + GFP_KERNEL); if (!io) return r; /* verity_dtr will free zero_digest */ @@ -1112,7 +1081,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) goto out; r = verity_hash(v, io, zero_data, 1 << v->data_dev_block_bits, - v->zero_digest, true); + v->zero_digest); out: kfree(io); @@ -1268,9 +1237,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) { struct dm_target *ti = v->ti; - struct crypto_ahash *ahash; - struct crypto_shash *shash = NULL; - const char *driver_name; + struct crypto_shash *shash; v->alg_name = kstrdup(alg_name, GFP_KERNEL); if (!v->alg_name) { @@ -1278,50 +1245,14 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) return -ENOMEM; } - /* - * Allocate the hash transformation object that this dm-verity instance - * will use. The vast majority of dm-verity users use CPU-based - * hashing, so when possible use the shash API to minimize the crypto - * API overhead. If the ahash API resolves to a different driver - * (likely an off-CPU hardware offload), use ahash instead. Also use - * ahash if the obsolete dm-verity format with the appended salt is - * being used, so that quirk only needs to be handled in one place. - */ - ahash = crypto_alloc_ahash(alg_name, 0, - v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0); - if (IS_ERR(ahash)) { + shash = crypto_alloc_shash(alg_name, 0, 0); + if (IS_ERR(shash)) { ti->error = "Cannot initialize hash function"; - return PTR_ERR(ahash); - } - driver_name = crypto_ahash_driver_name(ahash); - if (v->version >= 1 /* salt prepended, not appended? */) { - shash = crypto_alloc_shash(alg_name, 0, 0); - if (!IS_ERR(shash) && - strcmp(crypto_shash_driver_name(shash), driver_name) != 0) { - /* - * ahash gave a different driver than shash, so probably - * this is a case of real hardware offload. Use ahash. 
- */ - crypto_free_shash(shash); - shash = NULL; - } - } - if (!IS_ERR_OR_NULL(shash)) { - crypto_free_ahash(ahash); - ahash = NULL; - v->shash_tfm = shash; - v->digest_size = crypto_shash_digestsize(shash); - v->hash_reqsize = sizeof(struct shash_desc) + - crypto_shash_descsize(shash); - DMINFO("%s using shash \"%s\"", alg_name, driver_name); - } else { - v->ahash_tfm = ahash; - static_branch_inc(&ahash_enabled); - v->digest_size = crypto_ahash_digestsize(ahash); - v->hash_reqsize = sizeof(struct ahash_request) + - crypto_ahash_reqsize(ahash); - DMINFO("%s using ahash \"%s\"", alg_name, driver_name); + return PTR_ERR(shash); } + v->shash_tfm = shash; + v->digest_size = crypto_shash_digestsize(shash); + DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash)); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; return -EINVAL; @@ -1346,7 +1277,7 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) return -EINVAL; } } - if (v->shash_tfm) { + if (v->version) { /* Version 1: salt at beginning */ SHASH_DESC_ON_STACK(desc, v->shash_tfm); int r; @@ -1530,7 +1461,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* Root hash signature is a optional parameter*/ + /* Root hash signature is an optional parameter */ r = verity_verify_root_hash(root_hash_digest_to_validate, strlen(root_hash_digest_to_validate), verify_args.sig, @@ -1625,7 +1556,8 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + v->hash_reqsize; + ti->per_io_data_size = sizeof(struct dm_verity_io) + + crypto_shash_descsize(v->shash_tfm); r = verity_fec_ctr(v); if (r) @@ -1732,10 +1664,7 @@ static int verity_preresume(struct dm_target *ti) bdev = dm_disk(dm_table_get_md(ti->table))->part0; root_digest.digest = v->root_digest; root_digest.digest_len = v->digest_size; - if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) - root_digest.alg = crypto_ahash_alg_name(v->ahash_tfm); - else - root_digest.alg = crypto_shash_alg_name(v->shash_tfm); + root_digest.alg = crypto_shash_alg_name(v->shash_tfm); r = security_bdev_setintegrity(bdev, LSM_INT_DMVERITY_ROOTHASH, &root_digest, sizeof(root_digest)); @@ -1761,11 +1690,12 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 10, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, .map = verity_map, + .postsuspend = verity_postsuspend, .status = verity_status, .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index a9e2c6c0a33c..d5261a0e4232 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -71,9 +71,14 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, const char *arg_name) { struct dm_target *ti = v->ti; - int ret = 0; + int ret; const char *sig_key = NULL; + if (v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified"); + return -EINVAL; + } + if (!*argc) { ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); return -EINVAL; @@ -83,14 +88,18 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, (*argc)--; ret = 
verity_verify_get_sig_from_key(sig_key, sig_opts); - if (ret < 0) + if (ret < 0) { ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); + return ret; + } v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); - if (!v->signature_key_desc) + if (!v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key"); return -ENOMEM; + } - return ret; + return 0; } /* diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 8cbb57862ae1..6d141abd965c 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -39,11 +39,10 @@ struct dm_verity { struct dm_target *ti; struct dm_bufio_client *bufio; char *alg_name; - struct crypto_ahash *ahash_tfm; /* either this or shash_tfm is set */ - struct crypto_shash *shash_tfm; /* either this or ahash_tfm is set */ + struct crypto_shash *shash_tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ - u8 *initial_hashstate; /* salted initial state, if shash_tfm is set */ + u8 *initial_hashstate; /* salted initial state, if version >= 1 */ u8 *zero_digest; /* digest for a zero block */ #ifdef CONFIG_SECURITY u8 *root_digest_sig; /* signature of the root digest */ @@ -61,7 +60,6 @@ struct dm_verity { bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ unsigned int digest_size; /* digest size for the current hash algorithm */ - unsigned int hash_reqsize; /* the size of temporary space for crypto */ enum verity_mode mode; /* mode for handling verification errors */ enum verity_mode error_mode;/* mode for handling I/O errors */ unsigned int corrupted_errs;/* Number of errors for corrupted blocks */ @@ -100,19 +98,13 @@ struct dm_verity_io { u8 want_digest[HASH_MAX_DIGESTSIZE]; /* - * This struct is followed by a variable-sized hash request of size - * v->hash_reqsize, either a struct ahash_request or a struct shash_desc - * (depending on whether ahash_tfm or shash_tfm is being used). To - * access it, use verity_io_hash_req(). + * Temporary space for hashing. This is variable-length and must be at + * the end of the struct. struct shash_desc is just the fixed part; + * it's followed by a context of size crypto_shash_descsize(shash_tfm). 
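/*
 * Illustrative sketch only (not part of the patch): the usual pattern for
 * allocating a struct whose trailing member is a variable-length shash
 * descriptor, as described above. "example_io" and "example_alloc_io" are
 * hypothetical names; in the patch itself the space comes from
 * ti->per_io_data_size.
 */
#include <crypto/hash.h>
#include <linux/slab.h>

struct example_io {
	/* ...fixed fields would live here... */
	struct shash_desc desc;	/* followed by crypto_shash_descsize(tfm) bytes */
};

static struct example_io *example_alloc_io(struct crypto_shash *tfm)
{
	struct example_io *io;

	io = kmalloc(sizeof(*io) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (io)
		io->desc.tfm = tfm;
	return io;
}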
*/ + struct shash_desc hash_desc; }; -static inline void *verity_io_hash_req(struct dm_verity *v, - struct dm_verity_io *io) -{ - return io + 1; -} - static inline u8 *verity_io_real_digest(struct dm_verity *v, struct dm_verity_io *io) { @@ -126,7 +118,7 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, } extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, - const u8 *data, size_t len, u8 *digest, bool may_sleep); + const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 7ce8847b3404..d8de4a3076a1 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -13,7 +13,6 @@ #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> #include <linux/dax.h> -#include <linux/pfn_t.h> #include <linux/libnvdimm.h> #include <linux/delay.h> #include "dm-io-tracker.h" @@ -256,7 +255,7 @@ static int persistent_memory_claim(struct dm_writecache *wc) int r; loff_t s; long p, da; - pfn_t pfn; + unsigned long pfn; int id; struct page **pages; sector_t offset; @@ -290,7 +289,7 @@ static int persistent_memory_claim(struct dm_writecache *wc) r = da; goto err2; } - if (!pfn_t_has_page(pfn)) { + if (!pfn_valid(pfn)) { wc->memory_map = NULL; r = -EOPNOTSUPP; goto err2; @@ -314,13 +313,13 @@ static int persistent_memory_claim(struct dm_writecache *wc) r = daa ? daa : -EINVAL; goto err3; } - if (!pfn_t_has_page(pfn)) { + if (!pfn_valid(pfn)) { r = -EOPNOTSUPP; goto err3; } while (daa-- && i < p) { - pages[i++] = pfn_t_to_page(pfn); - pfn.val++; + pages[i++] = pfn_to_page(pfn); + pfn++; if (!(i & 15)) cond_resched(); } @@ -706,7 +705,7 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc) static void writecache_max_age_timer(struct timer_list *t) { - struct dm_writecache *wc = from_timer(wc, t, max_age_timer); + struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer); if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { queue_work(wc->writeback_wq, &wc->writeback_work); @@ -797,7 +796,7 @@ static void writecache_flush(struct dm_writecache *wc) bool need_flush_after_free; wc->uncommitted_blocks = 0; - del_timer(&wc->autocommit_timer); + timer_delete(&wc->autocommit_timer); if (list_empty(&wc->lru)) return; @@ -866,7 +865,7 @@ static void writecache_flush_work(struct work_struct *work) static void writecache_autocommit_timer(struct timer_list *t) { - struct dm_writecache *wc = from_timer(wc, t, autocommit_timer); + struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer); if (!writecache_has_error(wc)) queue_work(wc->writeback_wq, &wc->flush_work); @@ -927,8 +926,8 @@ static void writecache_suspend(struct dm_target *ti) struct dm_writecache *wc = ti->private; bool flush_on_suspend; - del_timer_sync(&wc->autocommit_timer); - del_timer_sync(&wc->max_age_timer); + timer_delete_sync(&wc->autocommit_timer); + timer_delete_sync(&wc->max_age_timer); wc_lock(wc); writecache_flush(wc); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 20edd3fabbab..5a840c4ae316 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -17,33 +17,26 @@ * For internal zone reports bypassing the top BIO submission path. 
*/ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, - sector_t sector, unsigned int nr_zones, - report_zones_cb cb, void *data) + unsigned int nr_zones, + struct dm_report_zones_args *args) { - struct gendisk *disk = md->disk; - int ret; - struct dm_report_zones_args args = { - .next_sector = sector, - .orig_data = data, - .orig_cb = cb, - }; - do { struct dm_target *tgt; + int ret; - tgt = dm_table_find_target(t, args.next_sector); + tgt = dm_table_find_target(t, args->next_sector); if (WARN_ON_ONCE(!tgt->type->report_zones)) return -EIO; - args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, - nr_zones - args.zone_idx); + args->tgt = tgt; + ret = tgt->type->report_zones(tgt, args, + nr_zones - args->zone_idx); if (ret < 0) return ret; - } while (args.zone_idx < nr_zones && - args.next_sector < get_capacity(disk)); + } while (args->zone_idx < nr_zones && + args->next_sector < get_capacity(md->disk)); - return args.zone_idx; + return args->zone_idx; } /* @@ -52,28 +45,41 @@ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, * generally implemented by targets using dm_report_zones(). */ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, + struct blk_report_zones_args *args) { struct mapped_device *md = disk->private_data; struct dm_table *map; - int srcu_idx, ret; + struct dm_table *zone_revalidate_map = md->zone_revalidate_map; + int srcu_idx, ret = -EIO; + bool put_table = false; - if (!md->zone_revalidate_map) { - /* Regular user context */ + if (!zone_revalidate_map || md->revalidate_map_task != current) { + /* + * Regular user context or + * Zone revalidation during __bind() is in progress, but this + * call is from a different process + */ if (dm_suspended_md(md)) return -EAGAIN; map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + put_table = true; } else { /* Zone revalidation during __bind() */ - map = md->zone_revalidate_map; + map = zone_revalidate_map; } - ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); + if (map) { + struct dm_report_zones_args dm_args = { + .disk = md->disk, + .next_sector = sector, + .rep_args = args, + }; + ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args); + } - if (!md->zone_revalidate_map) + if (put_table) dm_put_live_table(md, srcu_idx); return ret; @@ -106,7 +112,18 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, } args->next_sector = zone->start + zone->len; - return args->orig_cb(zone, args->zone_idx++, args->orig_data); + + /* If we have an internal callback, call it first. */ + if (args->cb) { + int ret; + + ret = args->cb(zone, args->zone_idx, args->data); + if (ret) + return ret; + } + + return disk_report_zone(args->disk, zone, args->zone_idx++, + args->rep_args); } /* @@ -153,33 +170,36 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; + unsigned int nr_zones = disk->nr_zones; int ret; if (!get_capacity(disk)) return 0; - /* Revalidate only if something changed. */ - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { - DMINFO("%s using %s zone append", - disk->disk_name, - queue_emulates_zone_append(q) ? "emulated" : "native"); - md->nr_zones = 0; - } - - if (md->nr_zones) + /* + * Do not revalidate if zone write plug resources have already + * been allocated. 
+ */ + if (dm_has_zone_plugs(md)) return 0; + DMINFO("%s using %s zone append", disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); + /* * Our table is not live yet. So the call to dm_get_live_table() * in dm_blk_report_zones() will fail. Set a temporary pointer to * our table for dm_blk_report_zones() to use directly. */ md->zone_revalidate_map = t; + md->revalidate_map_task = current; ret = blk_revalidate_disk_zones(disk); + md->revalidate_map_task = NULL; md->zone_revalidate_map = NULL; if (ret) { DMERR("Revalidate zones failed %d", ret); + disk->nr_zones = nr_zones; return ret; } @@ -337,15 +357,15 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, /* * Check if zone append is natively supported, and if not, set the - * mapped device queue as needing zone append emulation. + * mapped device queue as needing zone append emulation. If zone + * append is natively supported, make sure that + * max_hw_zone_append_sectors is not set to 0. */ WARN_ON_ONCE(queue_is_mq(q)); - if (dm_table_supports_zone_append(t)) { - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - } else { - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + if (!dm_table_supports_zone_append(t)) lim->max_hw_zone_append_sectors = 0; - } + else if (lim->max_hw_zone_append_sectors == 0) + lim->max_hw_zone_append_sectors = lim->max_zone_append_sectors; /* * Determine the max open and max active zone limits for the mapped @@ -380,15 +400,28 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, lim->max_open_zones = 0; lim->max_active_zones = 0; lim->max_hw_zone_append_sectors = 0; + lim->max_zone_append_sectors = 0; lim->zone_write_granularity = 0; lim->chunk_sectors = 0; lim->features &= ~BLK_FEAT_ZONED; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - md->nr_zones = 0; - disk->nr_zones = 0; return 0; } + if (get_capacity(disk) && dm_has_zone_plugs(t->md)) { + if (q->limits.chunk_sectors != lim->chunk_sectors) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change zone size", + disk->disk_name); + return -EINVAL; + } + if (lim->max_hw_zone_append_sectors != 0 && + !dm_table_is_wildcard(t)) { + DMWARN("%s: device has zone write plug resources. " + "New table must emulate zone append", + disk->disk_name); + return -EINVAL; + } + } /* * Warn once (when the capacity is not yet set) if the mapped device is * partially using zone resources of the target devices as that leads to @@ -408,6 +441,23 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, return 0; } +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim) +{ + struct mapped_device *md = t->md; + + if (lim->features & BLK_FEAT_ZONED) { + if (dm_table_supports_zone_append(t)) + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + else + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + } else { + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + md->nr_zones = 0; + md->disk->nr_zones = 0; + } +} + + /* * IO completion callback called from clone_endio(). 
*/ @@ -423,12 +473,10 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) */ if (clone->bi_status == BLK_STS_OK && bio_op(clone) == REQ_OP_ZONE_APPEND) { - sector_t mask = bdev_zone_sectors(disk->part0) - 1; - - orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; + orig_bio->bi_iter.bi_sector += + bdev_offset_from_zone_start(disk->part0, + clone->bi_iter.bi_sector); } - - return; } static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, @@ -454,10 +502,15 @@ int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset) { + struct dm_report_zones_args args = { + .disk = md->disk, + .next_sector = sector, + .cb = dm_zone_need_reset_cb, + .data = need_reset, + }; int ret; - ret = dm_blk_do_report_zones(md, t, sector, nr_zones, - dm_zone_need_reset_cb, need_reset); + ret = dm_blk_do_report_zones(md, t, nr_zones, &args); if (ret != nr_zones) { DMERR("Get %s zone reset bitmap failed\n", md->disk->disk_name); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d58db9a27e6c..76e2c6868548 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -76,9 +76,9 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, * pointer and the requested position. */ nr_blocks = block - wp_block; - ret = blkdev_issue_zeroout(dev->bdev, - dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), - dmz_blk2sect(nr_blocks), GFP_NOIO, 0); + ret = blk_zone_issue_zeroout(dev->bdev, + dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), + dmz_blk2sect(nr_blocks), GFP_NOIO); if (ret) { dmz_dev_err(dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6141fc25d842..9da329078ea4 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1015,7 +1015,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) /* * Pass on ioctl to the backend device. 
*/ -static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct dmz_target *dmz = ti->private; struct dmz_dev *dev = &dmz->dev[0]; @@ -1061,7 +1062,7 @@ static int dmz_iterate_devices(struct dm_target *ti, struct dmz_target *dmz = ti->private; unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); sector_t capacity; - int i, r; + int i, r = 0; for (i = 0; i < dmz->nr_ddevs; i++) { capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 12ecf07a3841..6c83ab940af7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -403,15 +403,16 @@ static void do_deferred_remove(struct work_struct *w) dm_deferred_remove(); } -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mapped_device *md = bdev->bd_disk->private_data; + struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, - struct block_device **bdev) + struct block_device **bdev, unsigned int cmd, + unsigned long arg, bool *forward) { struct dm_target *ti; struct dm_table *map; @@ -434,8 +435,8 @@ retry: if (dm_suspended_md(md)) return -EAGAIN; - r = ti->type->prepare_ioctl(ti, bdev); - if (r == -ENOTCONN && !fatal_signal_pending(current)) { + r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward); + if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); fsleep(10000); goto retry; @@ -454,9 +455,10 @@ static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode, { struct mapped_device *md = bdev->bd_disk->private_data; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward); + if (!forward || r < 0) goto out; if (r > 0) { @@ -488,18 +490,13 @@ u64 dm_start_time_ns_from_clone(struct bio *bio) } EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); -static inline bool bio_is_flush_with_data(struct bio *bio) -{ - return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); -} - static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio) { /* * If REQ_PREFLUSH set, don't account payload, it will be * submitted (and accounted) after this flush completes. 
*/ - if (bio_is_flush_with_data(bio)) + if (io->requeue_flush_with_data) return 0; if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) return io->sectors; @@ -588,6 +585,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t g io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; io->status = BLK_STS_OK; + io->requeue_flush_with_data = false; /* one ref is for submission, the other is for completion */ atomic_set(&io->io_count, 2); @@ -946,6 +944,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) struct mapped_device *md = io->md; blk_status_t io_error; bool requeued; + bool requeue_flush_with_data; requeued = dm_handle_requeue(io, first_stage); if (requeued && first_stage) @@ -962,6 +961,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) __dm_start_io_acct(io); dm_end_io_acct(io); } + requeue_flush_with_data = io->requeue_flush_with_data; free_io(io); smp_wmb(); this_cpu_dec(*md->pending_io); @@ -974,7 +974,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) if (requeued) return; - if (bio_is_flush_with_data(bio)) { + if (unlikely(requeue_flush_with_data)) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH. @@ -1022,10 +1022,8 @@ static void dm_wq_requeue_work(struct work_struct *work) * * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io. */ -static void dm_io_complete(struct dm_io *io) +static inline void dm_io_complete(struct dm_io *io) { - bool first_requeue; - /* * Only dm_io that has been split needs two stage requeue, otherwise * we may run into long bio clone chain during suspend and OOM could @@ -1034,12 +1032,7 @@ static void dm_io_complete(struct dm_io *io) * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they * also aren't handled via the first stage requeue. 
*/ - if (dm_io_flagged(io, DM_IO_WAS_SPLIT)) - first_requeue = true; - else - first_requeue = false; - - __dm_io_complete(io, first_requeue); + __dm_io_complete(io, dm_io_flagged(io, DM_IO_WAS_SPLIT)); } /* @@ -1082,22 +1075,6 @@ static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md) return &md->queue->limits; } -void disable_discard(struct mapped_device *md) -{ - struct queue_limits *limits = dm_get_queue_limits(md); - - /* device doesn't really support DISCARD, disable it */ - limits->max_hw_discard_sectors = 0; -} - -void disable_write_zeroes(struct mapped_device *md) -{ - struct queue_limits *limits = dm_get_queue_limits(md); - - /* device doesn't really support WRITE ZEROES, disable it */ - limits->max_write_zeroes_sectors = 0; -} - static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) { return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); @@ -1115,10 +1092,10 @@ static void clone_endio(struct bio *bio) if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && !bdev_max_discard_sectors(bio->bi_bdev)) - disable_discard(md); + blk_queue_disable_discard(md->queue); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bdev_write_zeroes_sectors(bio->bi_bdev)) - disable_write_zeroes(md); + blk_queue_disable_write_zeroes(md->queue); } if (static_branch_unlikely(&zoned_enabled) && @@ -1232,7 +1209,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; @@ -1307,8 +1284,9 @@ out: /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management - * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by - * __send_duplicate_bios(). + * operations, zone append writes (native with REQ_OP_ZONE_APPEND or emulated + * with write BIOs flagged with BIO_EMULATES_ZONE_APPEND) and any bio serviced + * by __send_duplicate_bios(). * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1341,11 +1319,19 @@ void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors) unsigned int bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); - BUG_ON(op_is_zone_mgmt(bio_op(bio))); - BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); BUG_ON(bio_sectors > *tio->len_ptr); BUG_ON(n_sectors > bio_sectors); + if (static_branch_unlikely(&zoned_enabled) && + unlikely(bdev_is_zoned(bio->bi_bdev))) { + enum req_op op = bio_op(bio); + + BUG_ON(op_is_zone_mgmt(op)); + BUG_ON(op == REQ_OP_WRITE); + BUG_ON(op == REQ_OP_WRITE_ZEROES); + BUG_ON(op == REQ_OP_ZONE_APPEND); + } + *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; @@ -1479,12 +1465,12 @@ static void setup_split_accounting(struct clone_info *ci, unsigned int len) static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, - unsigned *len, gfp_t gfp_flag) + unsigned *len) { struct bio *bio; - int try = (gfp_flag & GFP_NOWAIT) ? 
0 : 1; + int try; - for (; try < 2; try++) { + for (try = 0; try < 2; try++) { int bio_nr; if (try && num_bios > 1) @@ -1508,8 +1494,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned int num_bios, unsigned int *len, - gfp_t gfp_flag) + unsigned int num_bios, unsigned int *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; @@ -1526,7 +1511,7 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe * Using alloc_multiple_bios(), even if num_bios is 1, to consistently * support allocating using GFP_NOWAIT with GFP_NOIO fallback. */ - alloc_multiple_bios(&blist, ci, ti, num_bios, len, gfp_flag); + alloc_multiple_bios(&blist, ci, ti, num_bios, len); while ((clone = bio_list_pop(&blist))) { if (num_bios > 1) dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); @@ -1541,14 +1526,18 @@ static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; + blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + + if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == + (REQ_IDLE | REQ_SYNC)) + opf |= REQ_IDLE; /* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s). */ - bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); ci->bio = &flush_bio; ci->sector_count = 0; @@ -1564,7 +1553,7 @@ static void __send_empty_flush(struct clone_info *ci) atomic_add(ti->num_flush_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, - NULL, GFP_NOWAIT); + NULL); atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); } } else { @@ -1612,7 +1601,7 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti, __max_io_len(ti, ci->sector, max_granularity, max_sectors)); atomic_add(num_bios, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, num_bios, &len, GFP_NOIO); + bios = __send_duplicate_bios(ci, ti, num_bios, &len); /* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following (+1) subtraction @@ -1746,6 +1735,9 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); + if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count) + return BLK_STS_IOERR; + setup_split_accounting(ci, len); if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) { @@ -1784,19 +1776,35 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io, } #ifdef CONFIG_BLK_DEV_ZONED -static inline bool dm_zone_bio_needs_split(struct mapped_device *md, - struct bio *bio) +static inline bool dm_zone_bio_needs_split(struct bio *bio) { /* - * For mapped device that need zone append emulation, we must - * split any large BIO that straddles zone boundaries. + * Special case the zone operations that cannot or should not be split. 
*/ - return dm_emulate_zone_append(md) && bio_straddles_zones(bio) && - !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); + switch (bio_op(bio)) { + case REQ_OP_ZONE_APPEND: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + return false; + default: + break; + } + + /* + * When mapped devices use the block layer zone write plugging, we must + * split any large BIO to the mapped device limits to not submit BIOs + * that span zone boundaries and to avoid potential deadlocks with + * queue freeze operations. + */ + return bio_needs_zone_write_plugging(bio) || bio_straddles_zones(bio); } + static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { - return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); + if (!bio_needs_zone_write_plugging(bio)) + return false; + return blk_zone_plug_bio(bio, 0); } static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, @@ -1849,7 +1857,7 @@ static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, * not go crazy with the clone allocation. */ alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32), - NULL, GFP_NOIO); + NULL); } /* Get a clone and change it to a regular reset operation. */ @@ -1881,7 +1889,7 @@ static void __send_zone_reset_all_native(struct clone_info *ci, unsigned int bios; atomic_add(1, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, 1, NULL, GFP_NOIO); + bios = __send_duplicate_bios(ci, ti, 1, NULL); atomic_sub(1 - bios, &ci->io->io_count); ci->sector_count = 0; @@ -1912,8 +1920,7 @@ static blk_status_t __send_zone_reset_all(struct clone_info *ci) } #else -static inline bool dm_zone_bio_needs_split(struct mapped_device *md, - struct bio *bio) +static inline bool dm_zone_bio_needs_split(struct bio *bio) { return false; } @@ -1940,9 +1947,7 @@ static void dm_split_and_process_bio(struct mapped_device *md, is_abnormal = is_abnormal_io(bio); if (static_branch_unlikely(&zoned_enabled)) { - /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */ - need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) && - (is_abnormal || dm_zone_bio_needs_split(md, bio)); + need_split = is_abnormal || dm_zone_bio_needs_split(bio); } else { need_split = is_abnormal; } @@ -1969,6 +1974,15 @@ static void dm_split_and_process_bio(struct mapped_device *md, /* Only support nowait for normal IO */ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { + /* + * Don't support NOWAIT for FLUSH because it may allocate + * multiple bios and there's no easy way how to undo the + * allocations. + */ + if (bio->bi_opf & REQ_PREFLUSH) { + bio_wouldblock_error(bio); + return; + } io = alloc_io(md, bio, GFP_NOWAIT); if (unlikely(!io)) { /* Unable to do anything without dm_io. */ @@ -1980,12 +1994,30 @@ static void dm_split_and_process_bio(struct mapped_device *md, } init_clone_info(&ci, io, map, bio, is_abnormal); - if (bio->bi_opf & REQ_PREFLUSH) { + if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) { + /* + * The "flush_bypasses_map" is set on targets where it is safe + * to skip the map function and submit bios directly to the + * underlying block devices - currently, it is set for dm-linear + * and dm-stripe. + * + * If we have just one underlying device (i.e. there is one + * linear target or multiple linear targets pointing to the same + * device), we can send the flush with data directly to it. 
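/*
 * Illustrative note (not part of the patch): for a non-empty list_head
 * list, "head->next == head->prev" holds exactly when the list has a
 * single entry, which is what the check below relies on to detect a lone
 * underlying device; it is the same condition list_is_singular() tests
 * for non-empty lists.
 */
#include <linux/list.h>

static inline bool example_single_underlying_device(struct list_head *devices)
{
	return list_is_singular(devices);	/* !empty && next == prev */
}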
+ */ + if (bio->bi_iter.bi_size && map->flush_bypasses_map) { + struct list_head *devices = dm_table_get_devices(map); + if (devices->next == devices->prev) + goto send_preflush_with_data; + } + if (bio->bi_iter.bi_size) + io->requeue_flush_with_data = true; __send_empty_flush(&ci); /* dm_io_complete submits any data associated with flush */ goto out; } +send_preflush_with_data: if (static_branch_unlikely(&zoned_enabled) && (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { error = __send_zone_reset_all(&ci); @@ -2406,21 +2438,35 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct queue_limits *limits) { struct dm_table *old_map; - sector_t size; + sector_t size, old_size; int ret; lockdep_assert_held(&md->suspend_lock); size = dm_table_get_size(t); + old_size = dm_get_size(md); + + if (!dm_table_supports_size_change(t, old_size, size)) { + old_map = ERR_PTR(-EINVAL); + goto out; + } + + set_capacity(md->disk, size); + + ret = dm_table_set_restrictions(t, md->queue, limits); + if (ret) { + set_capacity(md->disk, old_size); + old_map = ERR_PTR(ret); + goto out; + } + /* * Wipe any geometry if the size of the table changed. */ - if (size != dm_get_size(md)) + if (size != old_size) memset(&md->geometry, 0, sizeof(md->geometry)); - set_capacity(md->disk, size); - dm_table_event_callback(t, event_callback, md); if (dm_table_request_based(t)) { @@ -2438,10 +2484,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * requests in the queue may refer to bio from the old bioset, * so you must walk through the queue to unprep. */ - if (!md->mempools) { + if (!md->mempools) md->mempools = t->mempools; - t->mempools = NULL; - } + else + dm_free_md_mempools(t->mempools); } else { /* * The md may already have mempools that need changing. @@ -2450,14 +2496,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ dm_free_md_mempools(md->mempools); md->mempools = t->mempools; - t->mempools = NULL; - } - - ret = dm_table_set_restrictions(t, md->queue, limits); - if (ret) { - old_map = ERR_PTR(ret); - goto out; } + t->mempools = NULL; old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, (void *)t); @@ -2884,7 +2924,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; - int r; + int r = 0; lockdep_assert_held(&md->suspend_lock); @@ -2936,8 +2976,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. */ - if (dm_request_based(md)) + if (map && dm_request_based(md)) { dm_stop_queue(md->queue); + set_bit(DMF_QUEUE_STOPPED, &md->flags); + } flush_workqueue(md->wq); @@ -2946,7 +2988,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * We call dm_wait_for_completion to wait for all existing requests * to finish. 
*/ - r = dm_wait_for_completion(md, task_state); + if (map) + r = dm_wait_for_completion(md, task_state); if (!r) set_bit(dmf_suspended_flag, &md->flags); @@ -2959,7 +3002,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (r < 0) { dm_queue_flush(md); - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); @@ -3043,7 +3086,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * so that mapping of targets can work correctly. * Request-based dm is queueing the deferred I/Os in its request_queue. */ - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); @@ -3623,10 +3666,13 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); + /* Not a real ioctl, but targets must not interpret non-DM ioctls */ + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward); if (r < 0) goto out; + WARN_ON_ONCE(!forward); ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_clear) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a0a8ff119815..7a795979ec72 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -58,6 +58,7 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); +bool dm_table_is_wildcard(struct dm_table *t); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -72,6 +73,8 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); @@ -102,20 +105,24 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim); int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset); +#define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL) #else #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { return false; } +#define dm_has_zone_plugs(md) false #endif /* diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index b2a00f213c2c..4b80165afd23 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -49,6 
+49,7 @@ static int md_setup_ents __initdata; * instead of just one. -- KTK * 18May2000: Added support for persistent-superblock arrays: * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n * md=n,device-list reads a RAID superblock from the devices * elements in device-list are read by name_to_kdev_t so can be * a hex number or something like /dev/hda1 /dev/sdb @@ -87,7 +88,7 @@ static int __init md_setup(char *str) md_setup_ents++; switch (get_option(&str, &level)) { /* RAID level */ case 2: /* could be 0 or -1.. */ - if (level == 0) { + if (level == 0 || level == LEVEL_LINEAR) { if (get_option(&str, &factor) != 2 || /* Chunk Size */ get_option(&str, &fault) != 2) { printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); @@ -95,7 +96,10 @@ static int __init md_setup(char *str) } md_setup_args[ent].level = level; md_setup_args[ent].chunk = 1 << (factor+12); - pername = "raid0"; + if (level == LEVEL_LINEAR) + pername = "linear"; + else + pername = "raid0"; break; } fallthrough; diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c3a42dd66ce5..84b7e2af6dba 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -29,17 +29,10 @@ #include <linux/buffer_head.h> #include <linux/seq_file.h> #include <trace/events/block.h> + #include "md.h" #include "md-bitmap.h" - -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 +#include "md-cluster.h" /* * in-memory bitmap: @@ -103,9 +96,19 @@ * */ +typedef __u16 bitmap_counter_t; + #define PAGE_BITS (PAGE_SIZE << 3) #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) + #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) @@ -212,6 +215,8 @@ struct bitmap { int cluster_slot; }; +static struct workqueue_struct *md_bitmap_wq; + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init); @@ -220,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap) return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } -static bool __bitmap_enabled(struct bitmap *bitmap) +static bool bitmap_enabled(void *data, bool flush) { - return bitmap->storage.filemap && - !test_bit(BITMAP_STALE, &bitmap->flags); -} - -static bool bitmap_enabled(struct mddev *mddev) -{ - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap = data; - if (!bitmap) - return false; + if (!flush) + return true; - return __bitmap_enabled(bitmap); + /* + * If caller want to flush bitmap pages to underlying disks, check if + * there are cached pages in filemap. 
+ */ + return !test_bit(BITMAP_STALE, &bitmap->flags) && + bitmap->storage.filemap != NULL; } /* @@ -426,8 +430,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; - unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) << - PAGE_SHIFT; + unsigned long num_pages = bitmap->storage.file_pages; + unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT; loff_t sboff, offset = mddev->bitmap_info.offset; sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; @@ -436,7 +440,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; /* we compare length (page numbers), not page offset. */ - if ((pg_index - store->sb_index) == store->file_pages - 1) { + if ((pg_index - store->sb_index) == num_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -472,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return -EINVAL; } - md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); + md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), + page, 0); return 0; } @@ -682,7 +687,7 @@ static void bitmap_update_sb(void *data) return; if (!bitmap->storage.sb_page) /* no superblock */ return; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ @@ -702,7 +707,7 @@ static void bitmap_update_sb(void *data) sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); - kunmap_atomic(sb); + kunmap_local(sb); if (bitmap->storage.file) write_file_page(bitmap, bitmap->storage.sb_page, 1); @@ -717,7 +722,7 @@ static void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->storage.sb_page) return; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); pr_debug(" version: %u\n", le32_to_cpu(sb->version)); @@ -736,7 +741,7 @@ static void bitmap_print_sb(struct bitmap *bitmap) pr_debug(" sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb); + kunmap_local(sb); } /* @@ -760,7 +765,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) return -ENOMEM; bitmap->storage.sb_index = 0; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->version = cpu_to_le32(BITMAP_MAJOR_HI); @@ -768,7 +773,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) chunksize = bitmap->mddev->bitmap_info.chunksize; BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) { - kunmap_atomic(sb); + kunmap_local(sb); pr_warn("bitmap chunksize not a power of 2\n"); return -EINVAL; } @@ -787,7 +792,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. 
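The write-behind clamp above is bounded by the 16-bit per-chunk counter: the top two bits are the NEEDED and RESYNC flags, leaving 14 bits of in-flight write count (COUNTER_MAX = 16383), which is why max_write_behind is capped at COUNTER_MAX / 2. A minimal standalone sketch of that layout (plain userspace C, not the kernel code, values illustrative only):

#include <stdio.h>
#include <stdint.h>

typedef uint16_t bitmap_counter_t;

#define NEEDED_MASK	((bitmap_counter_t)(1 << 15))	/* chunk needs resync */
#define RESYNC_MASK	((bitmap_counter_t)(1 << 14))	/* resync in progress */
#define COUNTER_MAX	((bitmap_counter_t)(RESYNC_MASK - 1))	/* 16383 writes */

int main(void)
{
	/* hypothetical counter value: dirty chunk with 3 writes in flight */
	bitmap_counter_t bmc = NEEDED_MASK | 3;

	printf("needed=%d resync=%d count=%u max=%u\n",
	       !!(bmc & NEEDED_MASK), !!(bmc & RESYNC_MASK),
	       (unsigned int)(bmc & COUNTER_MAX), (unsigned int)COUNTER_MAX);
	return 0;
}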
*/ write_behind = bitmap->mddev->bitmap_info.max_write_behind; - if (write_behind > COUNTER_MAX) + if (write_behind > COUNTER_MAX / 2) write_behind = COUNTER_MAX / 2; sb->write_behind = cpu_to_le32(write_behind); bitmap->mddev->bitmap_info.max_write_behind = write_behind; @@ -803,7 +808,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) sb->events_cleared = cpu_to_le64(bitmap->mddev->events); bitmap->mddev->bitmap_info.nodes = 0; - kunmap_atomic(sb); + kunmap_local(sb); return 0; } @@ -865,7 +870,7 @@ re_read: return err; err = -EINVAL; - sb = kmap_atomic(sb_page); + sb = kmap_local_page(sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -932,7 +937,7 @@ re_read: err = 0; out: - kunmap_atomic(sb); + kunmap_local(sb); if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { /* Assigning chunksize is required for "re_read" */ bitmap->mddev->bitmap_info.chunksize = chunksize; @@ -942,7 +947,7 @@ out: bmname(bitmap), err); goto out_no_sb; } - bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); + bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev); goto re_read; } @@ -1161,12 +1166,12 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) bit = file_page_offset(&bitmap->storage, chunk); /* set the bit */ - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else set_bit_le(bit, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); pr_debug("set file bit %lu page %lu\n", bit, index); /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); @@ -1190,12 +1195,12 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) if (!page) return; bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) clear_bit(bit, paddr); else clear_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; @@ -1214,12 +1219,12 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) if (!page) return -EINVAL; bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set = test_bit(bit, paddr); else set = test_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); return set; } @@ -1232,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!__bitmap_enabled(bitmap)) + if (!bitmap_enabled(bitmap, true)) return; /* look at each page to see if there are any set bits that need to be @@ -1388,9 +1393,9 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) * If the bitmap is out of date, dirty the whole page * and write it out */ - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); memset(paddr + offset, 0xff, PAGE_SIZE - offset); - kunmap_atomic(paddr); + kunmap_local(paddr); filemap_write_page(bitmap, i, true); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { @@ -1406,12 +1411,12 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) void *paddr; bool was_set; - paddr = kmap_atomic(page); + paddr = 
kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) was_set = test_bit(bit, paddr); else was_set = test_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); if (was_set) { /* if the disk bit is set, set the memory bit */ @@ -1546,10 +1551,10 @@ static void bitmap_daemon_work(struct mddev *mddev) bitmap_super_t *sb; bitmap->need_sync = 0; if (bitmap->storage.filemap) { - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb); + kunmap_local(sb); set_page_attr(bitmap, 0, BITMAP_PAGE_NEEDWRITE); } @@ -1670,24 +1675,13 @@ __acquires(bitmap->lock) &(bitmap->bp[page].map[pageoff]); } -static int bitmap_startwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool behind) +static void bitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) - return 0; - - if (behind) { - int bw; - atomic_inc(&bitmap->behind_writes); - bw = atomic_read(&bitmap->behind_writes); - if (bw > bitmap->behind_writes_used) - bitmap->behind_writes_used = bw; - - pr_debug("inc write-behind count %d/%lu\n", - bw, bitmap->mddev->bitmap_info.max_write_behind); - } + return; while (sectors) { sector_t blocks; @@ -1697,7 +1691,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); if (!bmc) { spin_unlock_irq(&bitmap->counts.lock); - return 0; + return; } if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { @@ -1733,25 +1727,16 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, else sectors = 0; } - return 0; } -static void bitmap_endwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool success, bool behind) +static void bitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; - if (behind) { - if (atomic_dec_and_test(&bitmap->behind_writes)) - wake_up(&bitmap->behind_wait); - pr_debug("dec write-behind count %d/%lu\n", - atomic_read(&bitmap->behind_writes), - bitmap->mddev->bitmap_info.max_write_behind); - } - while (sectors) { sector_t blocks; unsigned long flags; @@ -1764,15 +1749,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset, return; } - if (success && !bitmap->mddev->degraded && - bitmap->events_cleared < bitmap->mddev->events) { - bitmap->events_cleared = bitmap->mddev->events; - bitmap->need_sync = 1; - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); - } - - if (!success && !NEEDED(*bmc)) + if (!bitmap->mddev->degraded) { + if (bitmap->events_cleared < bitmap->mddev->events) { + bitmap->events_cleared = bitmap->mddev->events; + bitmap->need_sync = 1; + sysfs_notify_dirent_safe( + bitmap->sysfs_can_clear); + } + } else if (!NEEDED(*bmc)) { *bmc |= NEEDED_MASK; + } if (COUNTER(*bmc) == COUNTER_MAX) wake_up(&bitmap->overflow_wait); @@ -1795,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; - bool rv; + bool rv = false; - if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ - *blocks = 1024; - return true; /* always resync if no bitmap */ - } spin_lock_irq(&bitmap->counts.lock); - - rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc) { /* locked */ @@ -1852,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, 
sector_t offset, bitmap_counter_t *bmc; unsigned long flags; - if (bitmap == NULL) { - *blocks = 1024; - return; - } spin_lock_irqsave(&bitmap->counts.lock, flags); bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) @@ -1994,12 +1970,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); - if (sec < bitmap->mddev->recovery_cp) + if (sec < bitmap->mddev->resync_offset) /* We are asserting that the array is dirty, - * so move the recovery_cp address back so + * so move the resync_offset address back so * that it is obvious that it is dirty */ - bitmap->mddev->recovery_cp = sec; + bitmap->mddev->resync_offset = sec; } } @@ -2039,7 +2015,7 @@ static void md_bitmap_free(void *data) sysfs_put(bitmap->sysfs_can_clear); if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && - bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) + bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev)) md_cluster_stop(bitmap->mddev); /* Shouldn't be needed - but just in case.... */ @@ -2062,6 +2038,31 @@ static void md_bitmap_free(void *data) kfree(bitmap); } +static void bitmap_start_behind_write(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + int bw; + + atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + + pr_debug("inc write-behind count %d/%lu\n", + bw, bitmap->mddev->bitmap_info.max_write_behind); +} + +static void bitmap_end_behind_write(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); + pr_debug("dec write-behind count %d/%lu\n", + atomic_read(&bitmap->behind_writes), + bitmap->mddev->bitmap_info.max_write_behind); +} + static void bitmap_wait_behind_writes(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -2190,9 +2191,9 @@ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) return ERR_PTR(err); } -static int bitmap_create(struct mddev *mddev, int slot) +static int bitmap_create(struct mddev *mddev) { - struct bitmap *bitmap = __bitmap_create(mddev, slot); + struct bitmap *bitmap = __bitmap_create(mddev, -1); if (IS_ERR(bitmap)) return PTR_ERR(bitmap); @@ -2216,7 +2217,7 @@ static int bitmap_load(struct mddev *mddev) mddev_create_serial_pool(mddev, rdev); if (mddev_is_clustered(mddev)) - md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); + mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); /* Clear out old bitmap info first: Either there is none, or we * are resuming after someone else has possibly changed things, @@ -2234,7 +2235,7 @@ static int bitmap_load(struct mddev *mddev) || bitmap->events_cleared == mddev->events) /* no need to keep dirty bits to optimise a * re-add of a missing device */ - start = mddev->recovery_cp; + start = mddev->resync_offset; mutex_lock(&mddev->bitmap_info.mutex); err = md_bitmap_init_from_disk(bitmap, start); @@ -2342,7 +2343,8 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats) if (!bitmap) return -ENOENT; - + if (!bitmap->storage.sb_page) + return -EINVAL; sb = kmap_local_page(bitmap->storage.sb_page); stats->sync_size = le64_to_cpu(sb->sync_size); kunmap_local(sb); @@ -2568,15 +2570,14 @@ err: return ret; } -static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, - 
bool init) +static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return 0; - return __bitmap_resize(bitmap, blocks, chunksize, init); + return __bitmap_resize(bitmap, blocks, chunksize, false); } static ssize_t @@ -2653,7 +2654,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } mddev->bitmap_info.offset = offset; - rv = bitmap_create(mddev, -1); + rv = bitmap_create(mddev); if (rv) goto out; @@ -2965,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = { &max_backlog_used.attr, NULL }; -const struct attribute_group md_bitmap_group = { + +static struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; static struct bitmap_operations bitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_BITMAP, + .name = "bitmap", + }, + .enabled = bitmap_enabled, .create = bitmap_create, .resize = bitmap_resize, @@ -2981,10 +2989,16 @@ static struct bitmap_operations bitmap_ops = { .dirty_bits = bitmap_dirty_bits, .unplug = bitmap_unplug, .daemon_work = bitmap_daemon_work, + + .start_behind_write = bitmap_start_behind_write, + .end_behind_write = bitmap_end_behind_write, .wait_behind_writes = bitmap_wait_behind_writes, - .startwrite = bitmap_startwrite, - .endwrite = bitmap_endwrite, + .start_write = bitmap_start_write, + .end_write = bitmap_end_write, + .start_discard = bitmap_start_write, + .end_discard = bitmap_end_write, + .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, .cond_end_sync = bitmap_cond_end_sync, @@ -2998,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = { .copy_from_slot = bitmap_copy_from_slot, .set_pages = bitmap_set_pages, .free = md_bitmap_free, + + .group = &md_bitmap_group, }; -void mddev_set_bitmap_ops(struct mddev *mddev) +int md_bitmap_init(void) +{ + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + return -ENOMEM; + + return register_md_submodule(&bitmap_ops.head); +} + +void md_bitmap_exit(void) { - mddev->bitmap_ops = &bitmap_ops; + destroy_workqueue(md_bitmap_wq); + unregister_md_submodule(&bitmap_ops.head); } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 662e6fc141a7..b42a28fa83a0 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -9,19 +9,26 @@ #define BITMAP_MAGIC 0x6d746962 -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +/* + * version 3 is host-endian order, this is deprecated and not used for new + * array + */ +#define BITMAP_MAJOR_LO 3 +#define BITMAP_MAJOR_HOSTENDIAN 3 +/* version 4 is little-endian order, the default value */ +#define BITMAP_MAJOR_HI 4 +/* version 5 is only used for cluster */ +#define BITMAP_MAJOR_CLUSTERED 5 +/* version 6 is only used for lockless bitmap */ +#define BITMAP_MAJOR_LOCKLESS 6 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { - BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_FIRST_USE = 3, /* llbitmap is just created */ + BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */ + BITMAP_DAEMON_BUSY = 5, /* 
llbitmap daemon is not finished after daemon_sleep */ BITMAP_HOSTENDIAN =15, }; @@ -70,11 +77,15 @@ struct md_bitmap_stats { struct file *file; }; +typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset, + unsigned long sectors); + struct bitmap_operations { - bool (*enabled)(struct mddev *mddev); - int (*create)(struct mddev *mddev, int slot); - int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, - bool init); + struct md_submodule_head head; + + bool (*enabled)(void *data, bool flush); + int (*create)(struct mddev *mddev); + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); @@ -84,12 +95,18 @@ struct bitmap_operations { unsigned long e); void (*unplug)(struct mddev *mddev, bool sync); void (*daemon_work)(struct mddev *mddev); + + void (*start_behind_write)(struct mddev *mddev); + void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); - int (*startwrite)(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool behind); - void (*endwrite)(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool success, bool behind); + md_bitmap_fn *start_write; + md_bitmap_fn *end_write; + md_bitmap_fn *start_discard; + md_bitmap_fn *end_discard; + + sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset); + bool (*blocks_synced)(struct mddev *mddev, sector_t offset); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); @@ -107,9 +124,75 @@ struct bitmap_operations { sector_t *hi, bool clear_bits); void (*set_pages)(void *data, unsigned long pages); void (*free)(void *data); + + struct attribute_group *group; }; /* the bitmap API */ -void mddev_set_bitmap_ops(struct mddev *mddev); +static inline bool md_bitmap_registered(struct mddev *mddev) +{ + return mddev->bitmap_ops != NULL; +} + +static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush) +{ + /* bitmap_ops must be registered before creating bitmap. 
*/ + if (!md_bitmap_registered(mddev)) + return false; + + if (!mddev->bitmap) + return false; + + return mddev->bitmap_ops->enabled(mddev->bitmap, flush); +} + +static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + /* always resync if no bitmap */ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return true; + } + + return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded); +} + +static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return; + } + + mddev->bitmap_ops->end_sync(mddev, offset, blocks); +} + +#ifdef CONFIG_MD_BITMAP +int md_bitmap_init(void); +void md_bitmap_exit(void); +#else +static inline int md_bitmap_init(void) +{ + return 0; +} +static inline void md_bitmap_exit(void) +{ +} +#endif + +#ifdef CONFIG_MD_LLBITMAP +int md_llbitmap_init(void); +void md_llbitmap_exit(void); +#else +static inline int md_llbitmap_init(void) +{ + return 0; +} +static inline void md_llbitmap_exit(void) +{ +} +#endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 6595f89becdb..11f1e91d387d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread) md_wakeup_thread(mddev->sync_thread); if (hi > 0) { - if (lo < mddev->recovery_cp) - mddev->recovery_cp = lo; + if (lo < mddev->resync_offset) + mddev->resync_offset = lo; /* wake up thread to continue resync in case resync * is not finished */ - if (mddev->recovery_cp != MaxSector) { + if (mddev->resync_offset != MaxSector) { /* * clear the REMOTE flag since we will launch * resync thread in current node. @@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) ret = mddev->bitmap_ops->resize(mddev, le64_to_cpu(msg->high), - 0, false); + 0); break; default: ret = -1; @@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) lockres_free(bm_lockres); continue; } - if ((hi > 0) && (lo < mddev->recovery_cp)) { + if ((hi > 0) && (lo < mddev->resync_offset)) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - mddev->recovery_cp = lo; + mddev->resync_offset = lo; md_check_recovery(mddev); } @@ -979,7 +979,7 @@ err: lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); mddev->cluster_info = NULL; kfree(cinfo); return ret; @@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev) * Also, we should send BITMAP_NEEDS_SYNC message in * case reshaping is interrupted. 
*/ - if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) || (mddev->reshape_position != MaxSector && test_bit(MD_CLOSING, &mddev->flags))) resync_bitmap(mddev); @@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); kfree(cinfo); return 0; } @@ -1166,7 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz struct dlm_lock_resource *bm_lockres; char str[64]; - if (i == md_cluster_ops->slot_number(mddev)) + if (i == slot_number(mddev)) continue; bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); @@ -1216,7 +1216,7 @@ out: */ static int cluster_check_sync_size(struct mddev *mddev) { - int current_slot = md_cluster_ops->slot_number(mddev); + int current_slot = slot_number(mddev); int node_num = mddev->bitmap_info.nodes; struct dlm_lock_resource *bm_lockres; struct md_bitmap_stats stats; @@ -1605,14 +1605,21 @@ static int gather_bitmaps(struct md_rdev *rdev) pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; } - if ((hi > 0) && (lo < mddev->recovery_cp)) - mddev->recovery_cp = lo; + if ((hi > 0) && (lo < mddev->resync_offset)) + mddev->resync_offset = lo; } out: return err; } -static const struct md_cluster_operations cluster_ops = { +static struct md_cluster_operations cluster_ops = { + .head = { + .type = MD_CLUSTER, + .id = ID_CLUSTER, + .name = "cluster", + .owner = THIS_MODULE, + }, + .join = join, .leave = leave, .slot_number = slot_number, @@ -1642,13 +1649,12 @@ static int __init cluster_init(void) { pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); - register_md_cluster_operations(&cluster_ops, THIS_MODULE); - return 0; + return register_md_submodule(&cluster_ops.head); } static void cluster_exit(void) { - unregister_md_cluster_operations(); + unregister_md_submodule(&cluster_ops.head); } module_init(cluster_init); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 470bf18ffde5..8fb06d853173 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -10,6 +10,8 @@ struct mddev; struct md_rdev; struct md_cluster_operations { + struct md_submodule_head head; + int (*join)(struct mddev *mddev, int nodes); int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); @@ -35,4 +37,8 @@ struct md_cluster_operations { void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); }; +extern int md_setup_cluster(struct mddev *mddev, int nodes); +extern void md_cluster_stop(struct mddev *mddev); +extern void md_reload_sb(struct mddev *mddev, int raid_disk); + #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c new file mode 100644 index 000000000000..8d7b82c4a723 --- /dev/null +++ b/drivers/md/md-linear.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc + * ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr> + */ + +#include <linux/blkdev.h> +#include <linux/seq_file.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <trace/events/block.h> +#include "md.h" + +struct dev_info { + struct md_rdev *rdev; + sector_t end_sector; +}; + +struct linear_conf { + struct rcu_head rcu; + 
sector_t array_sectors; + /* a copy of mddev->raid_disks */ + int raid_disks; + struct dev_info disks[] __counted_by(raid_disks); +}; + +/* + * find which device holds a particular offset + */ +static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) +{ + int lo, mid, hi; + struct linear_conf *conf; + + lo = 0; + hi = mddev->raid_disks - 1; + conf = mddev->private; + + /* + * Binary Search + */ + + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; +} + +static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct linear_conf *conf; + sector_t array_sectors; + + conf = mddev->private; + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + + return array_sectors; +} + +static int linear_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + int err; + + md_init_stacking_limits(&lim); + lim.max_hw_sectors = mddev->chunk_sectors; + lim.logical_block_size = mddev->logical_block_size; + lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; + lim.io_min = mddev->chunk_sectors << 9; + lim.features |= BLK_FEAT_ATOMIC_WRITES; + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) + return err; + + return queue_limits_set(mddev->gendisk->queue, &lim); +} + +static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) +{ + struct linear_conf *conf; + struct md_rdev *rdev; + int ret = -EINVAL; + int cnt; + int i; + + conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); + if (!conf) + return ERR_PTR(-ENOMEM); + + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + + cnt = 0; + conf->array_sectors = 0; + + rdev_for_each(rdev, mddev) { + int j = rdev->raid_disk; + struct dev_info *disk = conf->disks + j; + sector_t sectors; + + if (j < 0 || j >= raid_disks || disk->rdev) { + pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); + goto out; + } + + disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } + + conf->array_sectors += rdev->sectors; + cnt++; + } + if (cnt != raid_disks) { + pr_warn("md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); + goto out; + } + + /* + * Here we calculate the device offsets. 
+ */ + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; + + for (i = 1; i < raid_disks; i++) + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; + + if (!mddev_is_dm(mddev)) { + ret = linear_set_limits(mddev); + if (ret) + goto out; + } + + return conf; + +out: + kfree(conf); + return ERR_PTR(ret); +} + +static int linear_run(struct mddev *mddev) +{ + struct linear_conf *conf; + int ret; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + conf = linear_conf(mddev, mddev->raid_disks); + if (IS_ERR(conf)) + return PTR_ERR(conf); + + mddev->private = conf; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + + ret = md_integrity_register(mddev); + if (ret) { + kfree(conf); + mddev->private = NULL; + } + return ret; +} + +static int linear_add(struct mddev *mddev, struct md_rdev *rdev) +{ + /* Adding a drive to a linear array allows the array to grow. + * It is permitted if the new drive has a matching superblock + * already on it, with raid_disk equal to raid_disks. + * It is achieved by creating a new linear_private_data structure + * and swapping it in in-place of the current one. + * The current one is never freed until the array is stopped. + * This avoids races. + */ + struct linear_conf *newconf, *oldconf; + + if (rdev->saved_raid_disk != mddev->raid_disks) + return -EINVAL; + + rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; + + newconf = linear_conf(mddev, mddev->raid_disks + 1); + if (IS_ERR(newconf)) + return PTR_ERR(newconf); + + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. 
+ */ + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); + mddev->raid_disks++; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); + kfree_rcu(oldconf, rcu); + return 0; +} + +static void linear_free(struct mddev *mddev, void *priv) +{ + struct linear_conf *conf = priv; + + kfree(conf); +} + +static bool linear_make_request(struct mddev *mddev, struct bio *bio) +{ + struct dev_info *tmp_dev; + sector_t start_sector, end_sector, data_offset; + sector_t bio_sector = bio->bi_iter.bi_sector; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + tmp_dev = which_dev(mddev, bio_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + end_sector = tmp_dev->end_sector; + data_offset = tmp_dev->rdev->data_offset; + + if (unlikely(bio_sector >= end_sector || + bio_sector < start_sector)) + goto out_of_bounds; + + if (unlikely(is_rdev_broken(tmp_dev->rdev))) { + md_error(mddev, tmp_dev->rdev); + bio_io_error(bio); + return true; + } + + if (unlikely(bio_end_sector(bio) > end_sector)) { + /* This bio crosses a device boundary, so we have to split it */ + bio = bio_submit_split_bioset(bio, end_sector - bio_sector, + &mddev->bio_set); + if (!bio) + return true; + } + + md_account_bio(mddev, &bio); + bio_set_dev(bio, tmp_dev->rdev->bdev); + bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - + start_sector + data_offset; + + if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !bdev_max_discard_sectors(bio->bi_bdev))) { + /* Just ignore it */ + bio_endio(bio); + } else { + if (mddev->gendisk) + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_write_zeroes(mddev, bio); + submit_bio_noacct(bio); + } + return true; + +out_of_bounds: + pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_iter.bi_sector, + tmp_dev->rdev->bdev, + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + bio_io_error(bio); + return true; +} + +static void linear_status(struct seq_file *seq, struct mddev *mddev) +{ + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); +} + +static void linear_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { + char *md_name = mdname(mddev); + + pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n", + md_name, rdev->bdev); + } +} + +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + +static struct md_personality linear_personality = { + .head = { + .type = MD_PERSONALITY, + .id = ID_LINEAR, + .name = "linear", + .owner = THIS_MODULE, + }, + + .make_request = linear_make_request, + .run = linear_run, + .free = linear_free, + .status = linear_status, + .hot_add_disk = linear_add, + .size = linear_size, + .quiesce = linear_quiesce, + .error_handler = linear_error, +}; + +static int __init linear_init(void) +{ + return register_md_submodule(&linear_personality.head); +} + +static void linear_exit(void) +{ + unregister_md_submodule(&linear_personality.head); +} + +module_init(linear_init); +module_exit(linear_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)"); 
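To make the new linear personality above easier to follow, here is a rough standalone illustration (made-up device sizes, not kernel code) of how the cumulative end_sector values built in linear_conf() drive the binary search in which_dev():

#include <stdio.h>

typedef unsigned long long sector_t;

/* same search as which_dev(): first member whose end_sector exceeds sector */
static int which_dev(const sector_t *end_sector, int raid_disks, sector_t sector)
{
	int lo = 0, hi = raid_disks - 1;

	while (hi > lo) {
		int mid = (hi + lo) / 2;

		if (sector < end_sector[mid])
			hi = mid;
		else
			lo = mid + 1;
	}
	return lo;
}

int main(void)
{
	/* three members of 100, 200 and 50 sectors -> end_sector 100, 300, 350 */
	sector_t end_sector[] = { 100, 300, 350 };

	printf("sector 250 -> disk %d\n", which_dev(end_sector, 3, 250)); /* disk 1 */
	printf("sector  20 -> disk %d\n", which_dev(end_sector, 3, 20));  /* disk 0 */
	return 0;
}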
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ +MODULE_ALIAS("md-linear"); +MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c new file mode 100644 index 000000000000..9c1ade19b774 --- /dev/null +++ b/drivers/md/md-llbitmap.c @@ -0,0 +1,1626 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/timer.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/file.h> +#include <linux/seq_file.h> +#include <trace/events/block.h> + +#include "md.h" +#include "md-bitmap.h" + +/* + * #### Background + * + * Redundant data is used to enhance data fault tolerance, and the storage + * methods for redundant data vary depending on the RAID levels. And it's + * important to maintain the consistency of redundant data. + * + * Bitmap is used to record which data blocks have been synchronized and which + * ones need to be resynchronized or recovered. Each bit in the bitmap + * represents a segment of data in the array. When a bit is set, it indicates + * that the multiple redundant copies of that data segment may not be + * consistent. Data synchronization can be performed based on the bitmap after + * power failure or readding a disk. If there is no bitmap, a full disk + * synchronization is required. + * + * #### Key Features + * + * - IO fastpath is lockless, if user issues lots of write IO to the same + * bitmap bit in a short time, only the first write has additional overhead + * to update bitmap bit, no additional overhead for the following writes; + * - support only resync or recover written data, means in the case creating + * new array or replacing with a new disk, there is no need to do a full disk + * resync/recovery; + * + * #### Key Concept + * + * ##### State Machine + * + * Each bit is one byte, contain 6 different states, see llbitmap_state. And + * there are total 8 different actions, see llbitmap_action, can change state: + * + * llbitmap state machine: transitions between states + * + * | | Startwrite | Startsync | Endsync | Abortsync| + * | --------- | ---------- | --------- | ------- | ------- | + * | Unwritten | Dirty | x | x | x | + * | Clean | Dirty | x | x | x | + * | Dirty | x | x | x | x | + * | NeedSync | x | Syncing | x | x | + * | Syncing | x | Syncing | Dirty | NeedSync | + * + * | | Reload | Daemon | Discard | Stale | + * | --------- | -------- | ------ | --------- | --------- | + * | Unwritten | x | x | x | x | + * | Clean | x | x | Unwritten | NeedSync | + * | Dirty | NeedSync | Clean | Unwritten | NeedSync | + * | NeedSync | x | x | Unwritten | x | + * | Syncing | NeedSync | x | Unwritten | NeedSync | + * + * Typical scenarios: + * + * 1) Create new array + * All bits will be set to Unwritten by default, if --assume-clean is set, + * all bits will be set to Clean instead. 
+ * + * 2) write data, raid1/raid10 have full copy of data, while raid456 doesn't and + * rely on xor data + * + * 2.1) write new data to raid1/raid10: + * Unwritten --StartWrite--> Dirty + * + * 2.2) write new data to raid456: + * Unwritten --StartWrite--> NeedSync + * + * Because the initial recover for raid456 is skipped, the xor data is not built + * yet, the bit must be set to NeedSync first and after lazy initial recover is + * finished, the bit will finally set to Dirty(see 5.1 and 5.4); + * + * 2.3) cover write + * Clean --StartWrite--> Dirty + * + * 3) daemon, if the array is not degraded: + * Dirty --Daemon--> Clean + * + * 4) discard + * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten + * + * 5) resync and recover + * + * 5.1) common process + * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean + * + * 5.2) resync after power failure + * Dirty --Reload--> NeedSync + * + * 5.3) recover while replacing with a new disk + * By default, the old bitmap framework will recover all data, and llbitmap + * implements this by a new helper, see llbitmap_skip_sync_blocks: + * + * skip recover for bits other than dirty or clean; + * + * 5.4) lazy initial recover for raid5: + * By default, the old bitmap framework will only allow new recover when there + * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added + * to perform raid456 lazy recover for set bits(from 2.2). + * + * 6. special handling for degraded array: + * + * - Dirty bits will never be cleared, daemon will just do nothing, so that if + * a disk is readded, Clean bits can be skipped with recovery; + * - Dirty bits will convert to Syncing from start write, to do data recovery + * for new added disks; + * - New write will convert bits to NeedSync directly; + * + * ##### Bitmap IO + * + * ##### Chunksize + * + * The default bitmap size is 128k, incluing 1k bitmap super block, and + * the default size of segment of data in the array each bit(chunksize) is 64k, + * and chunksize will adjust to twice the old size each time if the total number + * bits is not less than 127k.(see llbitmap_init) + * + * ##### READ + * + * While creating bitmap, all pages will be allocated and read for llbitmap, + * there won't be read afterwards + * + * ##### WRITE + * + * WRITE IO is divided into logical_block_size of the array, the dirty state + * of each block is tracked independently, for example: + * + * each page is 4k, contain 8 blocks; each block is 512 bytes contain 512 bit; + * + * | page0 | page1 | ... | page 31 | + * | | + * | \-----------------------\ + * | | + * | block0 | block1 | ... | block 8| + * | | + * | \-----------------\ + * | | + * | bit0 | bit1 | ... | bit511 | + * + * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding + * subpage will be marked dirty, such block must write first before the IO is + * issued. This behaviour will affect IO performance, to reduce the impact, if + * multiple bits are changed in the same block in a short time, all bits in this + * block will be changed to Dirty/NeedSync, so that there won't be any overhead + * until daemon clears dirty bits. + * + * ##### Dirty Bits synchronization + * + * IO fast path will set bits to dirty, and those dirty bits will be cleared + * by daemon after IO is done. 
llbitmap_page_ctl is used to synchronize between + * IO path and daemon; + * + * IO path: + * 1) try to grab a reference, if succeed, set expire time after 5s and return; + * 2) if failed to grab a reference, wait for daemon to finish clearing dirty + * bits; + * + * Daemon (Daemon will be woken up every daemon_sleep seconds): + * For each page: + * 1) check if page expired, if not skip this page; for expired page: + * 2) suspend the page and wait for inflight write IO to be done; + * 3) change dirty page to clean; + * 4) resume the page; + */ + +#define BITMAP_DATA_OFFSET 1024 + +/* 64k is the max IO size of sync IO for raid1/raid10 */ +#define MIN_CHUNK_SIZE (64 * 2) + +/* By default, daemon will be woken up every 30s */ +#define DEFAULT_DAEMON_SLEEP 30 + +/* + * Dirtied bits that have not been accessed for more than 5s will be cleared + * by daemon. + */ +#define DEFAULT_BARRIER_IDLE 5 + +enum llbitmap_state { + /* No valid data, init state after assemble the array */ + BitUnwritten = 0, + /* data is consistent */ + BitClean, + /* data will be consistent after IO is done, set directly for writes */ + BitDirty, + /* + * data need to be resynchronized: + * 1) set directly for writes if array is degraded, prevent full disk + * synchronization after readding a disk; + * 2) reassemble the array after power failure, and dirty bits are + * found after reloading the bitmap; + * 3) set for first write for raid5, to build initial xor data lazily + */ + BitNeedSync, + /* data is synchronizing */ + BitSyncing, + BitStateCount, + BitNone = 0xff, +}; + +enum llbitmap_action { + /* User write new data, this is the only action from IO fast path */ + BitmapActionStartwrite = 0, + /* Start recovery */ + BitmapActionStartsync, + /* Finish recovery */ + BitmapActionEndsync, + /* Failed recovery */ + BitmapActionAbortsync, + /* Reassemble the array */ + BitmapActionReload, + /* Daemon thread is trying to clear dirty bits */ + BitmapActionDaemon, + /* Data is deleted */ + BitmapActionDiscard, + /* + * Bitmap is stale, mark all bits in addition to BitUnwritten to + * BitNeedSync. + */ + BitmapActionStale, + BitmapActionCount, + /* Init state is BitUnwritten */ + BitmapActionInit, +}; + +enum llbitmap_page_state { + LLPageFlush = 0, + LLPageDirty, +}; + +struct llbitmap_page_ctl { + char *state; + struct page *page; + unsigned long expire; + unsigned long flags; + wait_queue_head_t wait; + struct percpu_ref active; + /* Per block size dirty state, maximum 64k page / 1 sector = 128 */ + unsigned long dirty[]; +}; + +struct llbitmap { + struct mddev *mddev; + struct llbitmap_page_ctl **pctl; + + unsigned int nr_pages; + unsigned int io_size; + unsigned int blocks_per_page; + + /* shift of one chunk */ + unsigned long chunkshift; + /* size of one chunk in sector */ + unsigned long chunksize; + /* total number of chunks */ + unsigned long chunks; + unsigned long last_end_sync; + /* + * time in seconds that dirty bits will be cleared if the page is not + * accessed. 
+ */ + unsigned long barrier_idle; + /* fires on first BitDirty state */ + struct timer_list pending_timer; + struct work_struct daemon_work; + + unsigned long flags; + __u64 events_cleared; + + /* for slow disks */ + atomic_t behind_writes; + wait_queue_head_t behind_wait; +}; + +struct llbitmap_unplug_work { + struct work_struct work; + struct llbitmap *llbitmap; + struct completion *done; +}; + +static struct workqueue_struct *md_llbitmap_io_wq; +static struct workqueue_struct *md_llbitmap_unplug_wq; + +static char state_machine[BitStateCount][BitmapActionCount] = { + [BitUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitNone, + [BitmapActionStale] = BitNone, + }, + [BitClean] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitDirty] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitClean, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitNeedSync] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNone, + }, + [BitSyncing] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitDirty, + [BitmapActionAbortsync] = BitNeedSync, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, +}; + +static void __llbitmap_flush(struct mddev *mddev); + +static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos) +{ + unsigned int idx; + unsigned int offset; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + offset = offset_in_page(pos); + + return llbitmap->pctl[idx]->state[offset]; +} + +/* set all the bits in the subpage as dirty */ +static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, + struct llbitmap_page_ctl *pctl, + unsigned int block) +{ + bool level_456 = raid_is_456(llbitmap->mddev); + unsigned int io_size = llbitmap->io_size; + int pos; + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + switch (pctl->state[pos]) { + case BitUnwritten: + pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; + break; + case BitClean: + pctl->state[pos] = BitDirty; + break; + } + } +} + +static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, + int offset) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + unsigned int io_size = llbitmap->io_size; + int block = offset / io_size; + int pos; + + if (!test_bit(LLPageDirty, &pctl->flags)) + set_bit(LLPageDirty, &pctl->flags); + + /* + * For degraded array, dirty bits will never be cleared, and we must + * resync all the dirty bits, hence skip infect new dirty bits to + * prevent resync unnecessary data. 
+ */ + if (llbitmap->mddev->degraded) { + set_bit(block, pctl->dirty); + return; + } + + /* + * The subpage usually contains a total of 512 bits. If any single bit + * within the subpage is marked as dirty, the entire sector will be + * written. To avoid impacting write performance, when multiple bits + * within the same sector are modified within llbitmap->barrier_idle, + * all bits in the sector will be collectively marked as dirty at once. + */ + if (test_and_set_bit(block, pctl->dirty)) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + if (pos == offset) + continue; + if (pctl->state[pos] == BitDirty || + pctl->state[pos] == BitNeedSync) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + } +} + +static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, + loff_t pos) +{ + unsigned int idx; + unsigned int bit; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + bit = offset_in_page(pos); + + llbitmap->pctl[idx]->state[bit] = state; + if (state == BitDirty || state == BitNeedSync) + llbitmap_set_page_dirty(llbitmap, idx, bit); +} + +static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) +{ + struct mddev *mddev = llbitmap->mddev; + struct page *page = NULL; + struct md_rdev *rdev; + + if (llbitmap->pctl && llbitmap->pctl[idx]) + page = llbitmap->pctl[idx]->page; + if (page) + return page; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return ERR_PTR(-ENOMEM); + + rdev_for_each(rdev, mddev) { + sector_t sector; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + + (idx << PAGE_SECTORS_SHIFT); + + if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ, + true)) + return page; + + md_error(mddev, rdev); + } + + __free_page(page); + return ERR_PTR(-EIO); +} + +static void llbitmap_write_page(struct llbitmap *llbitmap, int idx) +{ + struct page *page = llbitmap->pctl[idx]->page; + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + int block; + + for (block = 0; block < llbitmap->blocks_per_page; block++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (!test_and_clear_bit(block, pctl->dirty)) + continue; + + rdev_for_each(rdev, mddev) { + sector_t sector; + sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + rdev->sb_start + + (idx << PAGE_SECTORS_SHIFT) + + block * bit_sector; + md_write_metadata(mddev, rdev, sector, + llbitmap->io_size, page, + block * llbitmap->io_size); + } + } +} + +static void active_release(struct percpu_ref *ref) +{ + struct llbitmap_page_ctl *pctl = + container_of(ref, struct llbitmap_page_ctl, active); + + wake_up(&pctl->wait); +} + +static void llbitmap_free_pages(struct llbitmap *llbitmap) +{ + int i; + + if (!llbitmap->pctl) + return; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + if (!pctl || !pctl->page) + break; + + __free_page(pctl->page); + percpu_ref_exit(&pctl->active); + } + + kfree(llbitmap->pctl[0]); + kfree(llbitmap->pctl); + llbitmap->pctl = NULL; +} + +static int llbitmap_cache_pages(struct llbitmap *llbitmap) +{ + struct llbitmap_page_ctl *pctl; + unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks + + BITMAP_DATA_OFFSET, PAGE_SIZE); + unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS( + 
llbitmap->blocks_per_page)); + int i; + + llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!llbitmap->pctl) + return -ENOMEM; + + size = round_up(size, cache_line_size()); + pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); + if (!pctl) { + kfree(llbitmap->pctl); + return -ENOMEM; + } + + llbitmap->nr_pages = nr_pages; + + for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { + struct page *page = llbitmap_read_page(llbitmap, i); + + llbitmap->pctl[i] = pctl; + + if (IS_ERR(page)) { + llbitmap_free_pages(llbitmap); + return PTR_ERR(page); + } + + if (percpu_ref_init(&pctl->active, active_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + __free_page(page); + llbitmap_free_pages(llbitmap); + return -ENOMEM; + } + + pctl->page = page; + pctl->state = page_address(page); + init_waitqueue_head(&pctl->wait); + } + + return 0; +} + +static void llbitmap_init_state(struct llbitmap *llbitmap) +{ + enum llbitmap_state state = BitUnwritten; + unsigned long i; + + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + state = BitClean; + + for (i = 0; i < llbitmap->chunks; i++) + llbitmap_write(llbitmap, state, i); +} + +/* The return value is only used from resync, where @start == @end. */ +static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, + unsigned long start, + unsigned long end, + enum llbitmap_action action) +{ + struct mddev *mddev = llbitmap->mddev; + enum llbitmap_state state = BitNone; + bool level_456 = raid_is_456(llbitmap->mddev); + bool need_resync = false; + bool need_recovery = false; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return BitNone; + + if (action == BitmapActionInit) { + llbitmap_init_state(llbitmap); + return BitNone; + } + + while (start <= end) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) { + pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n", + __func__, start, c, action); + state = BitNeedSync; + goto write_bitmap; + } + + if (c == BitNeedSync) + need_resync = !mddev->degraded; + + state = state_machine[c][action]; + +write_bitmap: + if (unlikely(mddev->degraded)) { + /* For degraded array, mark new data as need sync. */ + if (state == BitDirty && + action == BitmapActionStartwrite) + state = BitNeedSync; + /* + * For degraded array, resync dirty data as well, noted + * if array is still degraded after resync is done, all + * new data will still be dirty until array is clean. + */ + else if (c == BitDirty && + action == BitmapActionStartsync) + state = BitSyncing; + } else if (c == BitUnwritten && state == BitDirty && + action == BitmapActionStartwrite && level_456) { + /* Delay raid456 initial recovery to first write. 
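+				 * The chunk is set to BitNeedSync so the initial
+				 * xor data is built lazily by the recovery path
+				 * rather than at array creation time.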
*/ + state = BitNeedSync; + } + + if (state == BitNone) { + start++; + continue; + } + + llbitmap_write(llbitmap, state, start); + + if (state == BitNeedSync) + need_resync = !mddev->degraded; + else if (state == BitDirty && + !timer_pending(&llbitmap->pending_timer)) + mod_timer(&llbitmap->pending_timer, + jiffies + mddev->bitmap_info.daemon_sleep * HZ); + + start++; + } + + if (need_resync && level_456) + need_recovery = true; + + if (need_recovery) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else if (need_resync) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + + return state; +} + +static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + +retry: + if (likely(percpu_ref_tryget_live(&pctl->active))) { + WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ); + return; + } + + wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active)); + goto retry; +} + +static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_put(&pctl->active); +} + +static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_kill(&pctl->active); + + if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) + return -ETIMEDOUT; + + return 0; +} + +static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + pctl->expire = LONG_MAX; + percpu_ref_resurrect(&pctl->active); + wake_up(&pctl->wait); +} + +static int llbitmap_check_support(struct mddev *mddev) +{ + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", + mdname(mddev)); + return -EBUSY; + } + + if (mddev->bitmap_info.space == 0) { + if (mddev->bitmap_info.default_space == 0) { + pr_notice("md/llbitmap: %s: no space for bitmap\n", + mdname(mddev)); + return -ENOSPC; + } + } + + if (!mddev->persistent) { + pr_notice("md/llbitmap: %s: array must be persistent\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.file) { + pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.external) { + pr_notice("md/llbitmap: %s: doesn't support external metadata\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev_is_dm(mddev)) { + pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + return 0; +} + +static int llbitmap_init(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + sector_t blocks = mddev->resync_max_sectors; + unsigned long chunksize = MIN_CHUNK_SIZE; + unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); + unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; + int ret; + + while (chunks > space) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; + + ret 
= llbitmap_cache_pages(llbitmap); + if (ret) + return ret; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionInit); + /* flush initial llbitmap to disk */ + __llbitmap_flush(mddev); + + return 0; +} + +static int llbitmap_read_sb(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + unsigned long daemon_sleep; + unsigned long chunksize; + unsigned long events; + struct page *sb_page; + bitmap_super_t *sb; + int ret = -EINVAL; + + if (!mddev->bitmap_info.offset) { + pr_err("md/llbitmap: %s: no super block found", mdname(mddev)); + return -EINVAL; + } + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("md/llbitmap: %s: read super block failed", + mdname(mddev)); + return -EIO; + } + + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_err("md/llbitmap: %s: invalid super block magic number", + mdname(mddev)); + goto out_put_page; + } + + if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { + pr_err("md/llbitmap: %s: invalid super block version", + mdname(mddev)); + goto out_put_page; + } + + if (memcmp(sb->uuid, mddev->uuid, 16)) { + pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", + mdname(mddev)); + goto out_put_page; + } + + if (mddev->bitmap_info.space == 0) { + int room = le32_to_cpu(sb->sectors_reserved); + + if (room) + mddev->bitmap_info.space = room; + else + mddev->bitmap_info.space = mddev->bitmap_info.default_space; + } + llbitmap->flags = le32_to_cpu(sb->state); + if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { + ret = llbitmap_init(llbitmap); + goto out_put_page; + } + + chunksize = le32_to_cpu(sb->chunksize); + if (!is_power_of_2(chunksize)) { + pr_err("md/llbitmap: %s: chunksize not a power of 2", + mdname(mddev)); + goto out_put_page; + } + + if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, + mddev->bitmap_info.space << SECTOR_SHIFT)) { + pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", + mdname(mddev), chunksize, mddev->resync_max_sectors, + mddev->bitmap_info.space); + goto out_put_page; + } + + daemon_sleep = le32_to_cpu(sb->daemon_sleep); + if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { + pr_err("md/llbitmap: %s: daemon sleep %lu period out of range", + mdname(mddev), daemon_sleep); + goto out_put_page; + } + + events = le64_to_cpu(sb->events); + if (events < mddev->events) { + pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery", + mdname(mddev), events, mddev->events); + set_bit(BITMAP_STALE, &llbitmap->flags); + } + + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + mddev->bitmap_info.chunksize = chunksize; + mddev->bitmap_info.daemon_sleep = daemon_sleep; + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunksize = chunksize; + llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize); + llbitmap->chunkshift = ffz(~chunksize); + ret = llbitmap_cache_pages(llbitmap); + +out_put_page: + __free_page(sb_page); + kunmap_local(sb); + return ret; +} + +static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) +{ + struct llbitmap *llbitmap = + container_of(pending_timer, struct llbitmap, pending_timer); + + if (work_busy(&llbitmap->daemon_work)) { + pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", + mdname(llbitmap->mddev), + llbitmap->mddev->bitmap_info.daemon_sleep); + set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); + return; + } + + queue_work(md_llbitmap_io_wq, 
&llbitmap->daemon_work); +} + +static void md_llbitmap_daemon_fn(struct work_struct *work) +{ + struct llbitmap *llbitmap = + container_of(work, struct llbitmap, daemon_work); + unsigned long start; + unsigned long end; + bool restart; + int idx; + + if (llbitmap->mddev->degraded) + return; +retry: + start = 0; + end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; + restart = false; + + for (idx = 0; idx < llbitmap->nr_pages; idx++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (idx > 0) { + start = end + 1; + end = min(end + PAGE_SIZE, llbitmap->chunks - 1); + } + + if (!test_bit(LLPageFlush, &pctl->flags) && + time_before(jiffies, pctl->expire)) { + restart = true; + continue; + } + + if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { + pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", + mdname(llbitmap->mddev), __func__, idx); + continue; + } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); + llbitmap_resume(llbitmap, idx); + } + + /* + * If the daemon took a long time to finish, retry to prevent missing + * clearing dirty bits. + */ + if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) + goto retry; + + /* If some page is dirty but not expired, setup timer again */ + if (restart) + mod_timer(&llbitmap->pending_timer, + jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); +} + +static int llbitmap_create(struct mddev *mddev) +{ + struct llbitmap *llbitmap; + int ret; + + ret = llbitmap_check_support(mddev); + if (ret) + return ret; + + llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL); + if (!llbitmap) + return -ENOMEM; + + llbitmap->mddev = mddev; + llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); + llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; + + timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); + INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); + atomic_set(&llbitmap->behind_writes, 0); + init_waitqueue_head(&llbitmap->behind_wait); + + mutex_lock(&mddev->bitmap_info.mutex); + mddev->bitmap = llbitmap; + ret = llbitmap_read_sb(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); + if (ret) { + kfree(llbitmap); + mddev->bitmap = NULL; + } + + return ret; +} + +static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long chunks; + + if (chunksize == 0) + chunksize = llbitmap->chunksize; + + /* If there is enough space, leave the chunksize unchanged. 
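+	 * Otherwise double it until one byte of bitmap state per chunk fits
+	 * into the reserved space, as llbitmap_init() does. For example
+	 * (illustrative numbers): resizing to 1TiB (2^31 sectors) with 1MiB
+	 * of bitmap space grows a 64KiB chunk to 1MiB, i.e. 2^20 chunks and
+	 * 2^20 state bytes.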
*/ + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + + return 0; +} + +static int llbitmap_load(struct mddev *mddev) +{ + enum llbitmap_action action = BitmapActionReload; + struct llbitmap *llbitmap = mddev->bitmap; + + if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) + action = BitmapActionStale; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); + return 0; +} + +static void llbitmap_destroy(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + mutex_lock(&mddev->bitmap_info.mutex); + + timer_delete_sync(&llbitmap->pending_timer); + flush_workqueue(md_llbitmap_io_wq); + flush_workqueue(md_llbitmap_unplug_wq); + + mddev->bitmap = NULL; + llbitmap_free_pages(llbitmap); + kfree(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static void llbitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_unplug_fn(struct work_struct *work) +{ + struct llbitmap_unplug_work *unplug_work = + container_of(work, struct llbitmap_unplug_work, work); + struct llbitmap *llbitmap = unplug_work->llbitmap; + struct blk_plug plug; + int i; + + 
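+	/* batch the metadata writes for all dirty pages under one plug, then wait for them */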
blk_start_plug(&plug); + + for (i = 0; i < llbitmap->nr_pages; i++) { + if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || + !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + continue; + + llbitmap_write_page(llbitmap, i); + } + + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); + complete(unplug_work->done); +} + +static bool llbitmap_dirty(struct llbitmap *llbitmap) +{ + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + return true; + + return false; +} + +static void llbitmap_unplug(struct mddev *mddev, bool sync) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct llbitmap *llbitmap = mddev->bitmap; + struct llbitmap_unplug_work unplug_work = { + .llbitmap = llbitmap, + .done = &done, + }; + + if (!llbitmap_dirty(llbitmap)) + return; + + /* + * Issue new bitmap IO under submit_bio() context will deadlock: + * - the bio will wait for bitmap bio to be done, before it can be + * issued; + * - bitmap bio will be added to current->bio_list and wait for this + * bio to be issued; + */ + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); + queue_work(md_llbitmap_unplug_wq, &unplug_work.work); + wait_for_completion(&done); + destroy_work_on_stack(&unplug_work.work); +} + +/* + * Force to write all bitmap pages to disk, called when stopping the array, or + * every daemon_sleep seconds when sync_thread is running. + */ +static void __llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + struct blk_plug plug; + int i; + + blk_start_plug(&plug); + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* mark all blocks as dirty */ + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + llbitmap_write_page(llbitmap, i); + } + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); + + timer_delete_sync(&llbitmap->pending_timer); + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); + flush_work(&llbitmap->daemon_work); + + __llbitmap_flush(mddev); +} + +/* This is used for raid5 lazy initial recovery */ +static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + return c == BitClean || c == BitDirty; +} + +static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + /* always skip unwritten blocks */ + if (c == BitUnwritten) + return blocks; + + /* For degraded array, don't skip */ + if (mddev->degraded) + return 0; + + /* For resync also skip clean/dirty blocks */ + if ((c == BitClean || c == BitDirty) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return blocks; + + return 0; +} + +static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + /* + * Handle one 
bit at a time, this is much simpler. And it doesn't matter + * if md_do_sync() loop more times. + */ + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + return llbitmap_state_machine(llbitmap, p, p, + BitmapActionStartsync) == BitSyncing; +} + +/* Something is wrong, sync_thread stop at @offset */ +static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, + BitmapActionAbortsync); +} + +/* A full sync_thread is finished */ +static void llbitmap_close_sync(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* let daemon_fn clear dirty bits immediately */ + WRITE_ONCE(pctl->expire, jiffies); + } + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionEndsync); +} + +/* + * sync_thread have reached @sector, update metadata every daemon_sleep seconds, + * just in case sync_thread have to restart after power failure. + */ +static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (sector == 0) { + llbitmap->last_end_sync = jiffies; + return; + } + + if (time_before(jiffies, llbitmap->last_end_sync + + HZ * mddev->bitmap_info.daemon_sleep)) + return; + + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + mddev->curr_resync_completed = sector; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, + BitmapActionEndsync); + __llbitmap_flush(mddev); + + llbitmap->last_end_sync = jiffies; + sysfs_notify_dirent_safe(mddev->sysfs_completed); +} + +static bool llbitmap_enabled(void *data, bool flush) +{ + struct llbitmap *llbitmap = data; + + return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); +} + +static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) +{ + llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); +} + +static void llbitmap_write_sb(struct llbitmap *llbitmap) +{ + int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); + + bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); + llbitmap_write_page(llbitmap, 0); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_update_sb(void *data) +{ + struct llbitmap *llbitmap = data; + struct mddev *mddev = llbitmap->mddev; + struct page *sb_page; + bitmap_super_t *sb; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return; + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("%s: %s: read super block failed", __func__, + mdname(mddev)); + set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); + return; + } + + if (mddev->events < llbitmap->events_cleared) + llbitmap->events_cleared = mddev->events; + + sb = kmap_local_page(sb_page); + sb->events = cpu_to_le64(mddev->events); + sb->state = cpu_to_le32(llbitmap->flags); + sb->chunksize = cpu_to_le32(llbitmap->chunksize); + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); + sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); + sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); + + kunmap_local(sb); + 
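+	/* persist the first BITMAP_DATA_OFFSET bytes of page 0 to every rdev */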
llbitmap_write_sb(llbitmap); +} + +static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) +{ + struct llbitmap *llbitmap = data; + + memset(stats, 0, sizeof(*stats)); + + stats->missing_pages = 0; + stats->pages = llbitmap->nr_pages; + stats->file_pages = llbitmap->nr_pages; + + stats->behind_writes = atomic_read(&llbitmap->behind_writes); + stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); + stats->events_cleared = llbitmap->events_cleared; + + return 0; +} + +/* just flag all pages as needing to be written */ +static void llbitmap_write_all(struct mddev *mddev) +{ + int i; + struct llbitmap *llbitmap = mddev->bitmap; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + } +} + +static void llbitmap_start_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + atomic_inc(&llbitmap->behind_writes); +} + +static void llbitmap_end_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (atomic_dec_and_test(&llbitmap->behind_writes)) + wake_up(&llbitmap->behind_wait); +} + +static void llbitmap_wait_behind_writes(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + wait_event(llbitmap->behind_wait, + atomic_read(&llbitmap->behind_writes) == 0); + +} + +static ssize_t bits_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + int bits[BitStateCount] = {0}; + loff_t start = 0; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "bitmap io error\n"); + } + + while (start < llbitmap->chunks) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) + pr_err("%s: invalid bit %llu state %d\n", + __func__, start, c); + else + bits[c]++; + start++; + } + + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + bits[BitUnwritten], bits[BitClean], bits[BitDirty], + bits[BitNeedSync], bits[BitSyncing]); +} + +static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + ssize_t ret; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", + llbitmap->chunksize, llbitmap->chunkshift, + llbitmap->chunks, mddev->bitmap_info.offset, + llbitmap->mddev->bitmap_info.daemon_sleep); + mutex_unlock(&mddev->bitmap_info.mutex); + + return ret; +} + +static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); + +static ssize_t +daemon_sleep_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); +} + +static ssize_t +daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + mddev->bitmap_info.daemon_sleep = timeout; + return len; +} + +static struct md_sysfs_entry 
llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep); + +static ssize_t +barrier_idle_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + return sprintf(page, "%lu\n", llbitmap->barrier_idle); +} + +static ssize_t +barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + llbitmap->barrier_idle = timeout; + return len; +} + +static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); + +static struct attribute *md_llbitmap_attrs[] = { + &llbitmap_bits.attr, + &llbitmap_metadata.attr, + &llbitmap_daemon_sleep.attr, + &llbitmap_barrier_idle.attr, + NULL +}; + +static struct attribute_group md_llbitmap_group = { + .name = "llbitmap", + .attrs = md_llbitmap_attrs, +}; + +static struct bitmap_operations llbitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_LLBITMAP, + .name = "llbitmap", + }, + + .enabled = llbitmap_enabled, + .create = llbitmap_create, + .resize = llbitmap_resize, + .load = llbitmap_load, + .destroy = llbitmap_destroy, + + .start_write = llbitmap_start_write, + .end_write = llbitmap_end_write, + .start_discard = llbitmap_start_discard, + .end_discard = llbitmap_end_discard, + .unplug = llbitmap_unplug, + .flush = llbitmap_flush, + + .start_behind_write = llbitmap_start_behind_write, + .end_behind_write = llbitmap_end_behind_write, + .wait_behind_writes = llbitmap_wait_behind_writes, + + .blocks_synced = llbitmap_blocks_synced, + .skip_sync_blocks = llbitmap_skip_sync_blocks, + .start_sync = llbitmap_start_sync, + .end_sync = llbitmap_end_sync, + .close_sync = llbitmap_close_sync, + .cond_end_sync = llbitmap_cond_end_sync, + + .update_sb = llbitmap_update_sb, + .get_stats = llbitmap_get_stats, + .dirty_bits = llbitmap_dirty_bits, + .write_all = llbitmap_write_all, + + .group = &md_llbitmap_group, +}; + +int md_llbitmap_init(void) +{ + md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_io_wq) + return -ENOMEM; + + md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_unplug_wq) { + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + return -ENOMEM; + } + + return register_md_submodule(&llbitmap_ops.head); +} + +void md_llbitmap_exit(void) +{ + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + destroy_workqueue(md_llbitmap_unplug_wq); + md_llbitmap_unplug_wq = NULL; + unregister_md_submodule(&llbitmap_ops.head); +} diff --git a/drivers/md/md.c b/drivers/md/md.c index aebe12b0ee27..e5922a682953 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = { [ACTION_IDLE] = "idle", }; -/* pers_list is a list of registered personalities protected by pers_lock. */ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); +static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; -const struct md_cluster_operations *md_cluster_ops; -EXPORT_SYMBOL(md_cluster_ops); -static struct module *md_cluster_mod; - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -100,13 +94,12 @@ static struct workqueue_struct *md_wq; * workqueue whith reconfig_mutex grabbed. 
*/ static struct workqueue_struct *md_misc_wq; -struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); -static void md_wakeup_thread_directly(struct md_thread __rcu *thread); +static void md_wakeup_thread_directly(struct md_thread __rcu **thread); /* * Default number of read corrections we'll attempt on an rdev @@ -117,32 +110,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 1000 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwidth if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load + * does not show up that much. Increase it if you want to have more guaranteed + * speed. Note that the RAID driver will use the maximum bandwidth + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. + * + * Background sync IO speed control: + * + * - below speed min: + * no limit; + * - above speed min and below speed max: + * a) if mddev is idle, then no limit; + * b) if mddev is busy handling normal IO, then limit inflight sync IO + * to sync_io_depth; + * - above speed max: + * sync IO can't be issued; * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. - * or /sys/block/mdX/md/sync_speed_{min,max} + * Following configurations can be changed via /proc/sys/dev/raid/ for system + * or /sys/block/mdX/md/ for one array. */ - static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(struct mddev *mddev) +static int sysctl_sync_io_depth = 32; + +static int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(struct mddev *mddev) +static int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } +static int sync_io_depth(struct mddev *mddev) +{ + return mddev->sync_io_depth ? 
+ mddev->sync_io_depth : sysctl_sync_io_depth; +} + static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) @@ -294,19 +303,26 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) static struct ctl_table_header *raid_table_header; -static struct ctl_table raid_table[] = { +static const struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_io_depth", + .data = &sysctl_sync_io_depth, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, }; @@ -322,6 +338,8 @@ static int start_readonly; * so all the races disappear. */ static bool create_on_open = true; +static bool legacy_async_del_gendisk = true; +static bool check_new_feature = true; /* * We have a system wide 'event count' that is incremented @@ -619,9 +637,12 @@ static void __mddev_put(struct mddev *mddev) mddev->ctime || mddev->hold_active) return; - /* Array is not configured at all, and not held active, so destroy it */ + /* + * If array is freed by stopping array, MD_DELETED is set by + * do_md_stop(), MD_DELETED is still set here in case mddev is freed + * directly by closing a mddev that is created by create_on_open. + */ set_bit(MD_DELETED, &mddev->flags); - /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done. @@ -629,6 +650,12 @@ static void __mddev_put(struct mddev *mddev) queue_work(md_misc_wq, &mddev->del_work); } +static void mddev_put_locked(struct mddev *mddev) +{ + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) @@ -650,8 +677,66 @@ static void active_io_release(struct percpu_ref *ref) static void no_op(struct percpu_ref *r) {} +static bool mddev_set_bitmap_ops(struct mddev *mddev) +{ + struct bitmap_operations *old = mddev->bitmap_ops; + struct md_submodule_head *head; + + if (mddev->bitmap_id == ID_BITMAP_NONE || + (old && old->head.id == mddev->bitmap_id)) + return true; + + xa_lock(&md_submodule); + head = xa_load(&md_submodule, mddev->bitmap_id); + + if (!head) { + pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + if (head->type != MD_BITMAP) { + pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + mddev->bitmap_ops = (void *)head; + xa_unlock(&md_submodule); + + if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + else + /* + * Inform user with KOBJ_CHANGE about new bitmap + * attributes. 
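+			 * The group appears under /sys/block/mdX/md/<name>/,
+			 * e.g. md/llbitmap/ for the lockless bitmap.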
+ */ + kobject_uevent(&mddev->kobj, KOBJ_CHANGE); + } + return true; + +err: + xa_unlock(&md_submodule); + return false; +} + +static void mddev_clear_bitmap_ops(struct mddev *mddev) +{ + if (!mddev_is_dm(mddev) && mddev->bitmap_ops && + mddev->bitmap_ops->group) + sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); + + mddev->bitmap_ops = NULL; +} + int mddev_init(struct mddev *mddev) { + int err = 0; + + if (!IS_ENABLED(CONFIG_MD_BITMAP)) + mddev->bitmap_id = ID_BITMAP_NONE; + else + mddev->bitmap_id = ID_BITMAP; if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) @@ -659,10 +744,23 @@ int mddev_init(struct mddev *mddev) if (percpu_ref_init(&mddev->writes_pending, no_op, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - percpu_ref_exit(&mddev->active_io); - return -ENOMEM; + err = -ENOMEM; + goto exit_acitve_io; } + err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_writes_pending; + + err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_bio_set; + + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, + offsetof(struct md_io_clone, bio_clone), 0); + if (err) + goto exit_sync_set; + /* We want to start with the refcount at zero */ percpu_ref_put(&mddev->writes_pending); @@ -686,17 +784,29 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; - mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); return 0; + +exit_sync_set: + bioset_exit(&mddev->sync_set); +exit_bio_set: + bioset_exit(&mddev->bio_set); +exit_writes_pending: + percpu_ref_exit(&mddev->writes_pending); +exit_acitve_io: + percpu_ref_exit(&mddev->active_io); + return err; } EXPORT_SYMBOL_GPL(mddev_init); void mddev_destroy(struct mddev *mddev) { + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_clone_set); percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->writes_pending); } @@ -850,6 +960,22 @@ void mddev_unlock(struct mddev *mddev) kobject_del(&rdev->kobj); export_rdev(rdev, mddev); } + + if (!legacy_async_del_gendisk) { + /* + * Call del_gendisk after release reconfig_mutex to avoid + * deadlock (e.g. call del_gendisk under the lock and an + * access to sysfs files waits the lock) + * And MD_DELETED is only used for md raid which is set in + * do_md_stop. dm raid only uses md_stop to stop. 
So dm raid + * doesn't need to check MD_DELETED when getting reconfig lock + */ + if (test_bit(MD_DELETED, &mddev->flags) && + !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) { + kobject_del(&mddev->kobj); + del_gendisk(mddev->gendisk); + } + } } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -888,16 +1014,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); -static struct md_personality *find_pers(int level, char *clevel) +static struct md_personality *get_pers(int level, char *clevel) { - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; + struct md_personality *ret = NULL; + struct md_submodule_head *head; + unsigned long i; + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_PERSONALITY) + continue; + if ((level != LEVEL_NONE && head->id == level) || + !strcmp(head->name, clevel)) { + if (try_module_get(head->owner)) + ret = (void *)head; + break; + } } - return NULL; + xa_unlock(&md_submodule); + + if (!ret) { + if (level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + level); + else + pr_warn("md: personality for level %s is not loaded!\n", + clevel); + } + + return ret; +} + +static void put_pers(struct md_personality *pers) +{ + module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ @@ -956,15 +1106,26 @@ static void super_written(struct bio *bio) wake_up(&mddev->sb_wait); } -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) +/** + * md_write_metadata - write metadata to underlying disk, including + * array superblock, badblocks, bitmap superblock and bitmap bits. + * @mddev: the array to write + * @rdev: the underlying disk to write + * @sector: the offset to @rdev + * @size: the length of the metadata + * @page: the metadata + * @offset: the offset to @page + * + * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment + * mddev->pending_writes before returning, and decrement it on completion, + * waking up sb_wait. Caller must call md_super_wait() after issuing io to all + * rdev. If an error occurred, md_error() will be called, and the @rdev will be + * kicked out from @mddev. + */ +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset) { - /* write first size bytes of page to sector of rdev - * Increment mddev->pending_writes before returning - * and decrement it on completion, waking up sb_wait - * if zero is reached. 
- * If an error occurred, call md_error - */ struct bio *bio; if (!page) @@ -982,7 +1143,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - __bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -1180,7 +1341,7 @@ int md_check_no_bitmap(struct mddev *mddev) if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); @@ -1292,6 +1453,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev) struct md_bitmap_stats stats; int err; + if (!md_bitmap_enabled(mddev, false)) + return 0; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 0; @@ -1355,13 +1519,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->recovery_cp = sb->recovery_cp; + mddev->resync_offset = sb->recovery_cp; } else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); @@ -1487,10 +1651,10 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->recovery_cp = mddev->recovery_cp; + sb->recovery_cp = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else sb->recovery_cp = 0; @@ -1589,8 +1753,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -1688,9 +1852,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } if (sb->pad0 || sb->pad3[0] || - memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) - /* Some padding is non-zero, might be a new feature */ - return -EINVAL; + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) { + pr_warn("Some padding is non-zero on %pg, might be a new feature\n", + rdev->bdev); + if (check_new_feature) + return -EINVAL; + pr_warn("check_new_feature is disabled, data corruption possible\n"); + } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); @@ -1748,7 +1916,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (badblocks_set(&rdev->badblocks, sector, count, 1)) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -1831,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); + mddev->logical_block_size = 
le32_to_cpu(sb->logical_block_size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; @@ -1841,7 +2010,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + mddev->resync_offset = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; @@ -2027,7 +2196,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->resync_offset = cpu_to_le64(mddev->resync_offset); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else @@ -2040,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + sb->logical_block_size = cpu_to_le32(mddev->logical_block_size); if (test_bit(FailFast, &rdev->flags)) sb->devflags |= FailFast1; else @@ -2238,8 +2408,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; @@ -2249,13 +2419,15 @@ static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { + struct mddev *mddev = rdev->mddev; + /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ - if (rdev->mddev->minor_version == 0) + if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on @@ -2267,8 +2439,7 @@ super_1_allow_new_offset(struct md_rdev *rdev, if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - if (!rdev->mddev->bitmap_info.file) { - struct mddev *mddev = rdev->mddev; + if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { struct md_bitmap_stats stats; int err; @@ -2359,19 +2530,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. 
- */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); @@ -2630,6 +2788,7 @@ void md_update_sb(struct mddev *mddev, int force_change) if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); return; } @@ -2639,11 +2798,11 @@ repeat: force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; - ret = md_cluster_ops->metadata_update_start(mddev); + ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); + mddev->cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); @@ -2720,7 +2879,7 @@ repeat: /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares - && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->in_sync && mddev->resync_offset == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; @@ -2753,24 +2912,24 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - mddev->bitmap_ops->update_sb(mddev->bitmap); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); + md_write_metadata(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page, 0); rdev->badblocks.size = 0; } @@ -2783,7 +2942,7 @@ rewrite: /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); + mddev->cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), @@ -2942,7 +3101,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else { err = 0; if (mddev_is_clustered(mddev)) - err = md_cluster_ops->remove_disk(mddev, rdev); + err = mddev->cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); @@ -3052,7 +3211,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || - (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } @@ -3860,7 +4019,7 @@ level_show(struct mddev *mddev, char *page) spin_lock(&mddev->lock); p = mddev->pers; if (p) - ret = sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) @@ -3917,7 +4076,7 @@ 
level_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); goto out_unlock; } @@ -3931,24 +4090,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - pr_warn("md: personality %s not loaded\n", clevel); + pers = get_pers(level, clevel); + if (!pers) { rv = -EINVAL; goto out_unlock; } - spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ - module_put(pers->owner); + put_pers(pers); rv = len; goto out_unlock; } if (!pers->takeover) { - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; @@ -3969,7 +4124,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); @@ -3984,7 +4139,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -4026,7 +4181,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } - module_put(oldpers->owner); + put_pers(oldpers); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) @@ -4057,7 +4212,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * it must always be in_sync */ mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -4103,6 +4258,86 @@ static struct md_sysfs_entry md_new_level = __ATTR(new_level, 0664, new_level_show, new_level_store); static ssize_t +bitmap_type_show(struct mddev *mddev, char *page) +{ + struct md_submodule_head *head; + unsigned long i; + ssize_t len = 0; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + len += sprintf(page + len, "[none] "); + else + len += sprintf(page + len, "none "); + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_BITMAP) + continue; + + if (mddev->bitmap_id == head->id) + len += sprintf(page + len, "[%s] ", head->name); + else + len += sprintf(page + len, "%s ", head->name); + } + xa_unlock(&md_submodule); + + len += sprintf(page + len, "\n"); + return len; +} + +static ssize_t +bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_submodule_head *head; + enum md_submodule_id id; + unsigned long i; + int err = 0; + + xa_lock(&md_submodule); + + if (mddev->bitmap_ops) { + err = -EBUSY; + goto out; + } + + if (cmd_match(buf, "none")) { + mddev->bitmap_id = ID_BITMAP_NONE; + goto out; + } + + xa_for_each(&md_submodule, i, head) { + if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { + mddev->bitmap_id = head->id; + goto out; + } + } + + err = kstrtoint(buf, 10, &id); + if 
(err) + goto out; + + if (id == ID_BITMAP_NONE) { + mddev->bitmap_id = id; + goto out; + } + + head = xa_load(&md_submodule, id); + if (head && head->type == MD_BITMAP) { + mddev->bitmap_id = id; + goto out; + } + + err = -ENOENT; + +out: + xa_unlock(&md_submodule); + return err ? err : len; +} + +static struct md_sysfs_entry md_bitmap_type = +__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); + +static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ @@ -4260,9 +4495,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) return sprintf(page, "none\n"); - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); + return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); } static ssize_t @@ -4288,7 +4523,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) err = -EBUSY; if (!err) { - mddev->recovery_cp = n; + mddev->resync_offset = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } @@ -4633,6 +4868,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) unsigned long chunk, end_chunk; int err; + if (!md_bitmap_enabled(mddev, false)) + return len; + err = mddev_lock(mddev); if (err) return err; @@ -4792,9 +5030,42 @@ out_unlock: static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); +static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) +{ + return rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sectors; +} + +static enum sync_action md_get_active_sync_action(struct mddev *mddev) +{ + struct md_rdev *rdev; + bool is_recover = false; + + if (mddev->resync_offset < MaxSector) + return ACTION_RESYNC; + + if (mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_needs_recovery(rdev, MaxSector)) { + is_recover = true; + break; + } + } + rcu_read_unlock(); + + return is_recover ? ACTION_RECOVER : ACTION_IDLE; +} + enum sync_action md_sync_action(struct mddev *mddev) { unsigned long recovery = mddev->recovery; + enum sync_action active_action; /* * frozen has the highest priority, means running sync_thread will be @@ -4818,8 +5089,17 @@ enum sync_action md_sync_action(struct mddev *mddev) !test_bit(MD_RECOVERY_NEEDED, &recovery)) return ACTION_IDLE; - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || - mddev->reshape_position != MaxSector) + /* + * Check if any sync operation (resync/recover/reshape) is + * currently active. This ensures that only one sync operation + * can run at a time. Returns the type of active operation, or + * ACTION_IDLE if none are active. 
+ */ + active_action = md_get_active_sync_action(mddev); + if (active_action != ACTION_IDLE) + return active_action; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return ACTION_RESHAPE; if (test_bit(MD_RECOVERY_RECOVER, &recovery)) @@ -4893,7 +5173,7 @@ static void stop_sync_thread(struct mddev *mddev, bool locked) * Thread might be blocked waiting for metadata update which will now * never happen */ - md_wakeup_thread_directly(mddev->sync_thread); + md_wakeup_thread_directly(&mddev->sync_thread); if (work_pending(&mddev->sync_work)) flush_work(&mddev->sync_work); @@ -5084,7 +5364,7 @@ static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), - mddev->sync_speed_min ? "local": "system"); + mddev->sync_speed_min ? "local" : "system"); } static ssize_t @@ -5093,7 +5373,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len) unsigned int min; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); @@ -5113,7 +5393,7 @@ static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), - mddev->sync_speed_max ? "local": "system"); + mddev->sync_speed_max ? "local" : "system"); } static ssize_t @@ -5122,7 +5402,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len) unsigned int max; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); @@ -5139,6 +5419,35 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t +sync_io_depth_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), + mddev->sync_io_depth ? 
"local" : "system"); +} + +static ssize_t +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; + } + mddev->sync_io_depth = max; + return len; +} + +static struct md_sysfs_entry md_sync_io_depth = +__ATTR_RW(sync_io_depth); + +static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); @@ -5584,7 +5893,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { - if (mddev->pers == NULL || (mddev->pers->level != 1)) + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); @@ -5610,7 +5919,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_suspend_and_lock(mddev); if (err) return err; - if (mddev->pers == NULL || (mddev->pers->level != 1)) { + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; @@ -5630,10 +5939,73 @@ static struct md_sysfs_entry md_serialize_policy = __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, serialize_policy_store); +static int mddev_set_logical_block_size(struct mddev *mddev, + unsigned int lbs) +{ + int err = 0; + struct queue_limits lim; + + if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) { + pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n", + mdname(mddev), lbs); + return -EINVAL; + } + + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.logical_block_size = lbs; + pr_info("%s: logical_block_size is changed, data may be lost\n", + mdname(mddev)); + err = queue_limits_commit_update(mddev->gendisk->queue, &lim); + if (err) + return err; + + mddev->logical_block_size = lbs; + /* New lbs will be written to superblock after array is running */ + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + return 0; +} + +static ssize_t +lbs_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%u\n", mddev->logical_block_size); +} + +static ssize_t +lbs_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int lbs; + int err = -EBUSY; + + /* Only 1.x meta supports configurable LBS */ + if (mddev->major_version == 0) + return -EINVAL; + + if (mddev->pers) + return -EBUSY; + + err = kstrtouint(buf, 10, &lbs); + if (err < 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + goto unlock; + + err = mddev_set_logical_block_size(mddev, lbs); + +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_logical_block_size = +__ATTR(logical_block_size, 0644, lbs_show, lbs_store); static struct attribute *md_default_attrs[] = { &md_level.attr, &md_new_level.attr, + &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -5651,6 +6023,7 @@ static struct attribute *md_default_attrs[] = { &md_consistency_policy.attr, &md_fail_last_dev.attr, &md_serialize_policy.attr, + &md_logical_block_size.attr, NULL, }; @@ -5664,6 +6037,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, + &md_sync_io_depth.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, @@ -5682,7 +6056,6 @@ static const struct 
attribute_group md_redundancy_group = { static const struct attribute_group *md_attr_groups[] = { &md_default_group, - &md_bitmap_group, NULL, }; @@ -5714,19 +6087,30 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; + struct kernfs_node *kn = NULL; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == array_state_store && cmd_match(page, "clear")) + kn = sysfs_break_active_protection(kobj, attr); + spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); + if (kn) + sysfs_unbreak_active_protection(kn); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -5734,12 +6118,13 @@ static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); - if (mddev->sysfs_state) - sysfs_put(mddev->sysfs_state); - if (mddev->sysfs_level) - sysfs_put(mddev->sysfs_level); - - del_gendisk(mddev->gendisk); + if (legacy_async_del_gendisk) { + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + del_gendisk(mddev->gendisk); + } put_disk(mddev->gendisk); } @@ -5769,6 +6154,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, return -EINVAL; } + /* + * Before RAID adding folio support, the logical_block_size + * should be smaller than the page size. + */ + if (lim->logical_block_size > PAGE_SIZE) { + pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n", + mdname(mddev)); + return -EINVAL; + } + mddev->logical_block_size = lim->logical_block_size; + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -5781,6 +6177,13 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) if (mddev_is_dm(mddev)) return 0; + if (queue_logical_block_size(rdev->bdev->bd_disk->queue) > + queue_logical_block_size(mddev->gendisk->queue)) { + pr_err("%s: incompatible logical_block_size, can not add\n", + mdname(mddev)); + return -EINVAL; + } + lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); @@ -5943,6 +6346,9 @@ static int md_alloc_and_put(dev_t dev, char *name) { struct mddev *mddev = md_alloc(dev, name); + if (legacy_async_del_gendisk) + pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); + if (IS_ERR(mddev)) return PTR_ERR(mddev); mddev_put(mddev); @@ -5988,7 +6394,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp) static void md_safemode_timeout(struct timer_list *t) { - struct mddev *mddev = from_timer(mddev, t, safemode_timer); + struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); mddev->safemode = 1; if (mddev->external) @@ -5999,6 +6405,26 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +static int md_bitmap_create(struct mddev *mddev) +{ + if (mddev->bitmap_id == ID_BITMAP_NONE) + return -EINVAL; + + if (!mddev_set_bitmap_ops(mddev)) + return -ENOENT; + + return mddev->bitmap_ops->create(mddev); +} + +static void md_bitmap_destroy(struct mddev *mddev) +{ + if (!md_bitmap_registered(mddev)) + return; + + mddev->bitmap_ops->destroy(mddev); + mddev_clear_bitmap_ops(mddev); +} + 
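The md_bitmap_create()/md_bitmap_destroy() helpers above select the bitmap implementation per array from mddev->bitmap_id (set through the new bitmap_type sysfs attribute) instead of assuming the single built-in bitmap. mddev_set_bitmap_ops() itself is not part of this hunk; the following is only a sketch of the lookup it presumably performs against the md_submodule xarray, and the assumption that struct bitmap_operations embeds a struct md_submodule_head named "head" is not taken from this diff:

/*
 * Hypothetical sketch: resolve mddev->bitmap_ops from the registered
 * MD_BITMAP submodules, keyed by mddev->bitmap_id.
 */
static bool mddev_set_bitmap_ops(struct mddev *mddev)
{
	struct md_submodule_head *head;

	xa_lock(&md_submodule);
	head = xa_load(&md_submodule, mddev->bitmap_id);
	if (head && head->type == MD_BITMAP && try_module_get(head->owner))
		/* assumes the ops structure starts with its submodule head */
		mddev->bitmap_ops = container_of(head, struct bitmap_operations, head);
	xa_unlock(&md_submodule);

	return mddev->bitmap_ops != NULL;
}

static void mddev_clear_bitmap_ops(struct mddev *mddev)
{
	if (mddev->bitmap_ops) {
		module_put(mddev->bitmap_ops->head.owner);
		mddev->bitmap_ops = NULL;
	}
}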
int md_run(struct mddev *mddev) { int err; @@ -6078,50 +6504,20 @@ int md_run(struct mddev *mddev) nowait = nowait && bdev_nowait(rdev->bdev); } - if (!bioset_initialized(&mddev->bio_set)) { - err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (err) - return err; - } - if (!bioset_initialized(&mddev->sync_set)) { - err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (err) - goto exit_bio_set; - } - - if (!bioset_initialized(&mddev->io_clone_set)) { - err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, - offsetof(struct md_io_clone, bio_clone), 0); - if (err) - goto exit_sync_set; - } - - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - pr_warn("md: personality for level %d is not loaded!\n", - mddev->level); - else - pr_warn("md: personality for level %s is not loaded!\n", - mddev->clevel); - err = -EINVAL; - goto abort; - } - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; + pers = get_pers(mddev->level, mddev->clevel); + if (!pers) + return -EINVAL; + if (mddev->level != pers->head.id) { + mddev->level = pers->head.id; + mddev->new_level = pers->head.id; } - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - module_put(pers->owner); - err = -EINVAL; - goto abort; + put_pers(pers); + return -EINVAL; } if (pers->sync_request) { @@ -6174,7 +6570,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev, -1); + err = md_bitmap_create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -6246,14 +6642,8 @@ bitmap_abort: if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - module_put(pers->owner); - mddev->bitmap_ops->destroy(mddev); -abort: - bioset_exit(&mddev->io_clone_set); -exit_sync_set: - bioset_exit(&mddev->sync_set); -exit_bio_set: - bioset_exit(&mddev->bio_set); + put_pers(pers); + md_bitmap_destroy(mddev); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6267,10 +6657,12 @@ int do_md_run(struct mddev *mddev) if (err) goto out; - err = mddev->bitmap_ops->load(mddev); - if (err) { - mddev->bitmap_ops->destroy(mddev); - goto out; + if (md_bitmap_registered(mddev)) { + err = mddev->bitmap_ops->load(mddev); + if (err) { + md_bitmap_destroy(mddev); + goto out; + } } if (mddev_is_clustered(mddev)) @@ -6354,7 +6746,7 @@ static void md_clean(struct mddev *mddev) mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; @@ -6362,21 +6754,29 @@ static void md_clean(struct mddev *mddev) mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; + /* - * Don't clear MD_CLOSING, or mddev can be opened again. - * 'hold_active != 0' means mddev is still in the creation - * process and will be used later. + * For legacy_async_del_gendisk mode, it can stop the array in the + * middle of assembling it, then it still can access the array. So + * it needs to clear MD_CLOSING. 
If not legacy_async_del_gendisk, + * it can't open the array again after stopping it. So it doesn't + * clear MD_CLOSING. */ - if (mddev->hold_active) - mddev->flags = 0; - else + if (legacy_async_del_gendisk && mddev->hold_active) { + clear_bit(MD_CLOSING, &mddev->flags); + } else { + /* if UNTIL_STOP is set, it's cleared here */ + mddev->hold_active = 0; + /* Don't clear MD_CLOSING, or mddev can be opened again. */ mddev->flags &= BIT_ULL_MASK(MD_CLOSING); + } mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; + mddev->logical_block_size = 0; mddev->max_disks = 0; mddev->events = 0; mddev->can_decrease_events = 0; @@ -6407,14 +6807,15 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6441,7 +6842,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - mddev->bitmap_ops->wait_behind_writes(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6457,7 +6859,7 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -6465,14 +6867,8 @@ static void __md_stop(struct mddev *mddev) if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - if (pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(pers->owner); + put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); - bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) @@ -6563,6 +6959,10 @@ static int do_md_stop(struct mddev *mddev, int mode) if (!md_is_rdwr(mddev)) set_disk_ro(disk, 0); + if (mode == 2 && mddev->pers->sync_request && + mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + __md_stop_writes(mddev); __md_stop(mddev); @@ -6595,10 +6995,9 @@ static int do_md_stop(struct mddev *mddev, int mode) mddev->bitmap_info.offset = 0; export_array(mddev); - md_clean(mddev); - if (mddev->hold_active == UNTIL_STOP) - mddev->hold_active = 0; + if (!legacy_async_del_gendisk) + set_bit(MD_DELETED, &mddev->flags); } md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -6983,7 +7382,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk(mddev, rdev); + err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; @@ -7000,14 +7399,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { - err = md_cluster_ops->new_disk_ack(mddev, - err == 
0); + err = mddev->cluster_ops->new_disk_ack( + mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) - md_cluster_ops->add_new_disk_cancel(mddev); + mddev->cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } @@ -7087,10 +7486,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) { - if (md_cluster_ops->remove_disk(mddev, rdev)) - goto busy; - } + if (mddev_is_clustered(mddev) && + mddev->cluster_ops->remove_disk(mddev, rdev)) + goto busy; md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -7179,6 +7577,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; + if (!md_bitmap_registered(mddev)) + return -EINVAL; + if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; @@ -7235,16 +7636,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev, -1); + err = md_bitmap_create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); if (err) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } } @@ -7309,9 +7710,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) * openned */ if (info->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; @@ -7393,7 +7794,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -7441,6 +7842,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } +static int get_cluster_ops(struct mddev *mddev) +{ + xa_lock(&md_submodule); + mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); + if (mddev->cluster_ops && + !try_module_get(mddev->cluster_ops->head.owner)) + mddev->cluster_ops = NULL; + xa_unlock(&md_submodule); + + return mddev->cluster_ops == NULL ? -ENOENT : 0; +} + +static void put_cluster_ops(struct mddev *mddev) +{ + if (!mddev->cluster_ops) + return; + + mddev->cluster_ops->leave(mddev); + module_put(mddev->cluster_ops->head.owner); + mddev->cluster_ops = NULL; +} + /* * update_array_info is used to change the configuration of an * on-line array. 
@@ -7529,12 +7952,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev, -1); + rv = md_bitmap_create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); if (rv) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } else { struct md_bitmap_stats stats; @@ -7549,19 +7972,18 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ - if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; - md_cluster_ops->unlock_all_bitmaps(mddev); + mddev->cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -7598,9 +8020,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; @@ -7842,7 +8264,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) - md_cluster_ops->new_disk_ack(mddev, false); + mddev->cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; @@ -8045,22 +8467,21 @@ static int md_thread(void *arg) return 0; } -static void md_wakeup_thread_directly(struct md_thread __rcu *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu **thread) { struct md_thread *t; rcu_read_lock(); - t = rcu_dereference(thread); + t = rcu_dereference(*thread); if (t) wake_up_process(t->tsk); rcu_read_unlock(); } -void md_wakeup_thread(struct md_thread __rcu *thread) +void __md_wakeup_thread(struct md_thread __rcu *thread) { struct md_thread *t; - rcu_read_lock(); t = rcu_dereference(thread); if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); @@ -8068,9 +8489,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread) if (wq_has_sleeper(&t->wqueue)) wake_up(&t->wqueue); } - rcu_read_unlock(); } -EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(__md_wakeup_thread); struct md_thread *md_register_thread(void (*run) (struct md_thread *), struct mddev *mddev, const char *name) @@ -8124,7 +8544,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0) + if (mddev->pers->head.id == ID_RAID0 || + mddev->pers->head.id == ID_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) @@ -8162,14 +8583,17 @@ static void status_unused(struct seq_file *seq) static void status_personalities(struct seq_file *seq) { - struct md_personality *pers; + struct md_submodule_head *head; + unsigned long i; seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - spin_unlock(&pers_lock); + 
xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) + if (head->type == MD_PERSONALITY) + seq_printf(seq, "[%s] ", head->name); + xa_unlock(&md_submodule); + seq_puts(seq, "\n"); } @@ -8225,7 +8649,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "\tresync=REMOTE"); return 1; } - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; } @@ -8338,6 +8762,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) unsigned long chunk_kb; int err; + if (!md_bitmap_enabled(mddev, false)) + return; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return; @@ -8376,6 +8803,10 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; spin_unlock(&all_mddevs_lock); + + /* prevent bitmap to be freed after checking */ + mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : ", mdname(mddev)); @@ -8388,7 +8819,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); + seq_printf(seq, " %s", mddev->pers->head.name); } else { seq_printf(seq, "inactive"); } @@ -8451,14 +8882,13 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + mutex_unlock(&mddev->bitmap_info.mutex); spin_lock(&all_mddevs_lock); if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) status_unused(seq); - if (atomic_dec_and_test(&mddev->active)) - __mddev_put(mddev); - + mddev_put_locked(mddev); return 0; } @@ -8509,67 +8939,34 @@ static const struct proc_ops mdstat_proc_ops = { .proc_poll = mdstat_poll, }; -int register_md_personality(struct md_personality *p) -{ - pr_debug("md: %s personality registered for level %d\n", - p->name, p->level); - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(register_md_personality); - -int unregister_md_personality(struct md_personality *p) -{ - pr_debug("md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(unregister_md_personality); - -int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module) +int register_md_submodule(struct md_submodule_head *msh) { - int ret = 0; - spin_lock(&pers_lock); - if (md_cluster_ops != NULL) - ret = -EALREADY; - else { - md_cluster_ops = ops; - md_cluster_mod = module; - } - spin_unlock(&pers_lock); - return ret; + return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); } -EXPORT_SYMBOL(register_md_cluster_operations); +EXPORT_SYMBOL_GPL(register_md_submodule); -int unregister_md_cluster_operations(void) +void unregister_md_submodule(struct md_submodule_head *msh) { - spin_lock(&pers_lock); - md_cluster_ops = NULL; - spin_unlock(&pers_lock); - return 0; + xa_erase(&md_submodule, msh->id); } -EXPORT_SYMBOL(unregister_md_cluster_operations); +EXPORT_SYMBOL_GPL(unregister_md_submodule); int md_setup_cluster(struct mddev *mddev, int nodes) { - int ret; - if (!md_cluster_ops) + int ret = get_cluster_ops(mddev); + + if (ret) { request_module("md-cluster"); - spin_lock(&pers_lock); + ret = get_cluster_ops(mddev); + } + /* ensure module won't be unloaded */ - if (!md_cluster_ops 
|| !try_module_get(md_cluster_mod)) { + if (ret) { pr_warn("can't find md-cluster module or get its reference.\n"); - spin_unlock(&pers_lock); - return -ENOENT; + return ret; } - spin_unlock(&pers_lock); - ret = md_cluster_ops->join(mddev, nodes); + ret = mddev->cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; @@ -8577,56 +8974,58 @@ int md_setup_cluster(struct mddev *mddev, int nodes) void md_cluster_stop(struct mddev *mddev) { - if (!md_cluster_ops) - return; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); } -static int is_mddev_idle(struct mddev *mddev, int init) +static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) { + unsigned long last_events = rdev->last_events; + + if (!bdev_is_partition(rdev->bdev)) + return true; + + /* + * If rdev is partition, and user doesn't issue IO to the array, the + * array is still not idle if user issues IO to other partitions. + */ + rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, + sectors) - + part_stat_read_accum(rdev->bdev, sectors); + + return init || rdev->last_events <= last_events; +} + +/* + * mddev is idle if following conditions are matched since last check: + * 1) mddev doesn't have normal IO completed; + * 2) mddev doesn't have inflight normal IO; + * 3) if any member disk is partition, and other partitions don't have IO + * completed; + * + * Noted this checking rely on IO accounting is enabled. + */ +static bool is_mddev_idle(struct mddev *mddev, int init) +{ + unsigned long last_events = mddev->normal_io_events; + struct gendisk *disk; struct md_rdev *rdev; - int idle; - int curr_events; + bool idle = true; - idle = 1; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_disk; + disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk; + if (!disk) + return true; - if (!init && !blk_queue_io_stat(disk->queue)) - continue; + mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); + if (!init && (mddev->normal_io_events > last_events || + bdev_count_inflight(disk->part0))) + idle = false; - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. 
- * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } - } + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (!is_rdev_holder_idle(rdev, init)) + idle = false; rcu_read_unlock(); + return idle; } @@ -8745,12 +9144,38 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); +static void md_bitmap_start(struct mddev *mddev, + struct md_io_clone *md_io_clone) +{ + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->start_discard : + mddev->bitmap_ops->start_write; + + if (mddev->pers->bitmap_sector) + mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, + &md_io_clone->sectors); + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); +} + +static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) +{ + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->end_discard : + mddev->bitmap_ops->end_write; + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); +} + static void md_end_clone_io(struct bio *bio) { struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; @@ -8775,6 +9200,13 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); + if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { + md_io_clone->offset = (*bio)->bi_iter.bi_sector; + md_io_clone->sectors = bio_sectors(*bio); + md_io_clone->rw = op_stat_group(bio_op(*bio)); + md_bitmap_start(mddev, md_io_clone); + } + clone->bi_end_io = md_end_clone_io; clone->bi_private = md_io_clone; *bio = clone; @@ -8793,6 +9225,9 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; @@ -8856,6 +9291,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev, } } +/* + * If lazy recovery is requested and all rdevs are in sync, select the rdev with + * the highest index to perform recovery to build initial xor data; this is the + * same as the old bitmap. 
+ */ +static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) +{ + struct md_rdev *recover_rdev = NULL; + struct md_rdev *rdev; + bool ret = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + + if (test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + break; + + if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) + recover_rdev = rdev; + } + + if (recover_rdev) { + clear_bit(In_sync, &recover_rdev->flags); + ret = true; + } + + rcu_read_unlock(); + return ret; +} + static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) { sector_t start = 0; @@ -8867,7 +9335,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) return mddev->resync_min; case ACTION_RESYNC: if (!mddev->bitmap) - return mddev->recovery_cp; + return mddev->resync_offset; return 0; case ACTION_RESHAPE: /* @@ -8883,14 +9351,18 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) start = MaxSector; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < start) + if (rdev_needs_recovery(rdev, start)) start = rdev->recovery_offset; rcu_read_unlock(); + /* + * If there are no spares, and raid456 lazy initial recover is + * requested. + */ + if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && + start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) + start = 0; + /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. @@ -8909,6 +9381,16 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) } } +static bool sync_io_within_limit(struct mddev *mddev) +{ + /* + * For raid456, sync IO is stripe(4k) per IO, for other levels, it's + * RESYNC_PAGES(64k) per IO. + */ + return atomic_read(&mddev->recovery_active) < + (raid_is_456(mddev) ? 
8 : 128) * sync_io_depth(mddev); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -8944,7 +9426,7 @@ void md_do_sync(struct md_thread *thread) } if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); + ret = mddev->cluster_ops->resync_start(mddev); if (ret) goto skip; @@ -8958,6 +9440,11 @@ void md_do_sync(struct md_thread *thread) } action = md_sync_action(mddev); + if (action == ACTION_FROZEN || action == ACTION_IDLE) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto skip; + } + desc = md_sync_action_name(action); mddev->last_sync_action = action; @@ -8971,7 +9458,7 @@ void md_do_sync(struct md_thread *thread) * */ if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start_notify(mddev); + mddev->cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -9088,8 +9575,8 @@ void md_do_sync(struct md_thread *thread) atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && - j > mddev->recovery_cp) - mddev->recovery_cp = j; + j > mddev->resync_offset) + mddev->resync_offset = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9111,6 +9598,12 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; + if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { + sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); + if (sectors) + goto update; + } + sectors = mddev->pers->sync_request(mddev, j, max_sectors, &skipped); if (sectors == 0) { @@ -9126,6 +9619,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; +update: j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ @@ -9177,7 +9671,8 @@ void md_do_sync(struct md_thread *thread) msleep(500); goto repeat; } - if (!is_mddev_idle(mddev, 0)) { + if (!sync_io_within_limit(mddev) && + !is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait. 
@@ -9208,19 +9703,19 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->recovery_cp) { + if (mddev->curr_resync >= mddev->resync_offset) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync_completed; else - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync; } } else - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; @@ -9228,12 +9723,8 @@ void md_do_sync(struct md_thread *thread) test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) + if (mddev->delta_disks >= 0 && + rdev_needs_recovery(rdev, mddev->curr_resync)) rdev->recovery_offset = mddev->curr_resync; rcu_read_unlock(); } @@ -9324,6 +9815,12 @@ static bool rdev_is_spare(struct md_rdev *rdev) static bool rdev_addable(struct md_rdev *rdev) { + struct mddev *mddev; + + mddev = READ_ONCE(rdev->mddev); + if (!mddev) + return false; + /* rdev is already used, don't add it again. */ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || test_bit(Faulty, &rdev->flags)) @@ -9334,7 +9831,7 @@ static bool rdev_addable(struct md_rdev *rdev) return true; /* Allow to add if array is read-write. */ - if (md_is_rdwr(rdev->mddev)) + if (md_is_rdwr(mddev)) return true; /* @@ -9362,17 +9859,11 @@ static bool md_spares_need_change(struct mddev *mddev) return false; } -static int remove_and_add_spares(struct mddev *mddev, - struct md_rdev *this) +static int remove_spares(struct mddev *mddev, struct md_rdev *this) { struct md_rdev *rdev; - int spares = 0; int removed = 0; - if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - /* Mustn't remove devices when resync thread is running */ - return 0; - rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev_removeable(rdev) && !mddev->pers->hot_remove_disk(mddev, rdev)) { @@ -9386,6 +9877,21 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return removed; +} + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) +{ + struct md_rdev *rdev; + int spares = 0; + int removed = 0; + + if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + /* Mustn't remove devices when resync thread is running */ + return 0; + + removed = remove_spares(mddev, this); if (this && removed) goto no_add; @@ -9423,6 +9929,16 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + return true; + } + + /* Check if resync is in progress. 
*/ + if (mddev->resync_offset < MaxSector) { + remove_spares(mddev, NULL); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9432,7 +9948,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) * re-add. */ *spares = remove_and_add_spares(mddev, NULL); - if (*spares) { + if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -9442,13 +9958,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } - /* Check if recovery is in progress. */ - if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - return true; - } - /* Delay to choose resync/check/repair in md_do_sync(). */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) return true; @@ -9497,7 +10006,7 @@ static void md_start_sync(struct work_struct *ws) * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ - if (spares) + if (spares && md_bitmap_enabled(mddev, true)) mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? @@ -9561,6 +10070,52 @@ static void unregister_sync_thread(struct mddev *mddev) md_reap_sync_thread(mddev); } +static bool md_should_do_recovery(struct mddev *mddev) +{ + /* + * As long as one of the following flags is set, + * recovery needs to do or cleanup. + */ + if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return true; + + /* + * If no flags are set and it is in read-only status, + * there is nothing to do. + */ + if (!md_is_rdwr(mddev)) + return false; + + /* + * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to + * active, and no action is needed for now. + * All other MD_SB_* flags require to update the superblock. + */ + if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) + return true; + + /* + * If the array is not using external metadata and there has been no data + * written for some time, then the array's status needs to be set to + * in_sync. + */ + if (mddev->external == 0 && mddev->safemode == 1) + return true; + + /* + * When the system is about to restart or the process receives an signal, + * the array needs to be synchronized as soon as possible. + * Once the data synchronization is completed, need to change the array + * status to in_sync. + */ + if (mddev->safemode == 2 && !mddev->in_sync && + mddev->resync_offset == MaxSector) + return true; + + return false; +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -9585,7 +10140,7 @@ static void unregister_sync_thread(struct mddev *mddev) */ void md_check_recovery(struct mddev *mddev) { - if (mddev->bitmap) + if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { @@ -9597,18 +10152,7 @@ void md_check_recovery(struct mddev *mddev) flush_signals(current); } - if (!md_is_rdwr(mddev) && - !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) - return; - if ( ! 
( - (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - (mddev->external == 0 && mddev->safemode == 1) || - (mddev->safemode == 2 - && !mddev->in_sync && mddev->recovery_cp == MaxSector) - )) + if (!md_should_do_recovery(mddev)) return; if (mddev_trylock(mddev)) { @@ -9652,6 +10196,7 @@ void md_check_recovery(struct mddev *mddev) } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); @@ -9664,8 +10209,8 @@ void md_check_recovery(struct mddev *mddev) * remove disk. */ rdev_for_each_safe(rdev, tmp, mddev) { - if (test_and_clear_bit(ClusterRemove, &rdev->flags) && - rdev->raid_disk < 0) + if (rdev->raid_disk < 0 && + test_and_clear_bit(ClusterRemove, &rdev->flags)) md_kick_rdev_from_array(rdev); } } @@ -9755,21 +10300,22 @@ void md_reap_sync_thread(struct mddev *mddev) * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) - md_cluster_ops->resync_finish(mddev); + mddev->cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); /* - * We call md_cluster_ops->update_size here because sync_size could + * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9807,12 +10353,11 @@ EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ -/* Returns 1 on success, 0 on failure */ -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +/* Returns true on success, false on failure */ +bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { struct mddev *mddev = rdev->mddev; - int rv; /* * Recording new badblocks for faulty rdev will force unnecessary @@ -9822,50 +10367,50 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, * avoid it. 
*/ if (test_bit(Faulty, &rdev->flags)) - return 1; + return true; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_set(&rdev->badblocks, s, sectors, 0); - if (rv == 0) { - /* Make sure they get written out promptly */ - if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->sb_flags, 0, - BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); - md_wakeup_thread(rdev->mddev->thread); - return 1; - } else - return 0; + + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + return false; + + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return true; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_clear(&rdev->badblocks, s, sectors); - if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + + if (!badblocks_clear(&rdev->badblocks, s, sectors)) + return; + + if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct mddev *mddev, *n; - int need_delay = 0; + struct mddev *mddev; spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -9876,21 +10421,11 @@ static int md_notify_reboot(struct notifier_block *this, mddev->safemode = 2; mddev_unlock(mddev); } - need_delay = 1; - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... 
- */ - if (need_delay) - msleep(1000); - return NOTIFY_DONE; } @@ -9909,8 +10444,16 @@ static void md_geninit(void) static int __init md_init(void) { - int ret = -ENOMEM; + int ret = md_bitmap_init(); + + if (ret) + return ret; + + ret = md_llbitmap_init(); + if (ret) + goto err_bitmap; + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -9919,11 +10462,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0); - if (!md_bitmap_wq) - goto err_bitmap_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -9942,12 +10480,13 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_bitmap_wq); -err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_llbitmap_exit(); +err_bitmap: + md_bitmap_exit(); return ret; } @@ -9965,14 +10504,17 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); - else + else if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ rdev_for_each_safe(rdev2, tmp, mddev) { - if (test_bit(Faulty, &rdev2->flags)) + if (test_bit(Faulty, &rdev2->flags)) { + if (test_bit(ClusterRemove, &rdev2->flags)) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); continue; + } /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); @@ -9995,7 +10537,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && - !md_cluster_ops->resync_status_get(mddev)) { + !mddev->cluster_ops->resync_status_get(mddev)) { /* * -1 to make raid1_add_disk() set conf->fullsync * to 1. This could avoid skipping sync when the @@ -10211,7 +10753,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev, *n; + struct mddev *mddev; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -10232,7 +10774,7 @@ static __exit void md_exit(void) remove_proc_entry("mdstat", NULL); spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -10244,14 +10786,14 @@ static __exit void md_exit(void) * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. 
*/ - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); + md_bitmap_exit(); } subsys_initcall(md_init); @@ -10270,6 +10812,8 @@ module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); module_param(create_on_open, bool, S_IRUSR|S_IWUSR); +module_param(legacy_async_del_gendisk, bool, 0600); +module_param(check_new_feature, bool, 0600); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); diff --git a/drivers/md/md.h b/drivers/md/md.h index 4ba93af36126..6985f2829bbd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -18,11 +18,38 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/raid/md_u.h> #include <trace/events/block.h> -#include "md-cluster.h" #define MaxSector (~(sector_t)0) +enum md_submodule_type { + MD_PERSONALITY = 0, + MD_CLUSTER, + MD_BITMAP, +}; + +enum md_submodule_id { + ID_LINEAR = LEVEL_LINEAR, + ID_RAID0 = 0, + ID_RAID1 = 1, + ID_RAID4 = 4, + ID_RAID5 = 5, + ID_RAID6 = 6, + ID_RAID10 = 10, + ID_CLUSTER, + ID_BITMAP, + ID_LLBITMAP, + ID_BITMAP_NONE, +}; + +struct md_submodule_head { + enum md_submodule_type type; + enum md_submodule_id id; + const char *name; + struct module *owner; +}; + /* * These flags should really be called "NO_RETRY" rather than * "FAILFAST" because they don't make any promise about time lapse, @@ -106,7 +133,7 @@ struct md_rdev { sector_t sectors; /* Device size (in 512bytes sectors) */ struct mddev *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ + unsigned long last_events; /* IO event timestamp */ /* * If meta_bdev is non-NULL, it means that a separate device is @@ -266,8 +293,8 @@ enum flag_bits { Nonrot, /* non-rotational device (SSD) */ }; -static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, @@ -284,16 +311,17 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } -extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new); -extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new); +extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); +extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); struct md_cluster_info; +struct md_cluster_operations; /** * enum mddev_flags - md device flags. 
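Every MD sub-component (personality, cluster support, bitmap implementation) is now described by a struct md_submodule_head, keyed by md_submodule_id in the single md_submodule xarray. Combined with the md_personality change further down in this header (the old name/level/list/owner fields are replaced by an embedded head), a personality conversion is expected to look roughly as follows; raid0 is used as an example and the callback names are illustrative, not quoted from this diff:

/* Hypothetical: a personality declared and registered via the new framework. */
static struct md_personality raid0_personality = {
	.head = {
		.type  = MD_PERSONALITY,
		.id    = ID_RAID0,
		.name  = "raid0",
		.owner = THIS_MODULE,
	},
	.make_request	= raid0_make_request,	/* illustrative names */
	.run		= raid0_run,
	.free		= raid0_free,
	/* ... */
};

static int __init raid0_init(void)
{
	return register_md_submodule(&raid0_personality.head);
}

static void __exit raid0_exit(void)
{
	unregister_md_submodule(&raid0_personality.head);
}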
@@ -326,6 +354,7 @@ enum mddev_flags { MD_HAS_MULTIPLE_PPLS, MD_NOT_READY, MD_BROKEN, + MD_DO_DELETE, MD_DELETED, }; @@ -377,7 +406,8 @@ struct mddev { * are happening, so run/ * takeover/stop are not safe */ - struct gendisk *gendisk; + struct gendisk *gendisk; /* mdraid gendisk */ + struct gendisk *dm_gendisk; /* dm-raid gendisk */ struct kobject kobj; int hold_active; @@ -403,6 +433,7 @@ struct mddev { sector_t array_sectors; /* exported array size */ int external_size; /* size managed * externally */ + unsigned int logical_block_size; __u64 events; /* If the last 'event' was simply a clean->dirty transition, and * we didn't write it to the spares, then it is safe and simple @@ -456,6 +487,7 @@ struct mddev { /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; + int sync_io_depth; /* resync even though the same disks are shared among md-devices */ int parallel_resync; @@ -491,9 +523,10 @@ struct mddev { * adding a spare */ + unsigned long normal_io_events; /* IO event timestamp */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; - sector_t recovery_cp; + sector_t resync_offset; sector_t resync_min; /* user requested sync * starts here */ sector_t resync_max; /* resync should pause @@ -535,6 +568,7 @@ struct mddev { struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ + enum md_submodule_id bitmap_id; void *bitmap; /* the bitmap for the device */ struct bitmap_operations *bitmap_ops; struct { @@ -576,6 +610,7 @@ struct mddev { mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; + struct md_cluster_operations *cluster_ops; unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ @@ -634,6 +669,8 @@ enum recovery_flags { MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, + /* raid456 lazy initial recover */ + MD_RECOVERY_LAZY_RECOVER, }; enum md_ro_state { @@ -669,11 +706,26 @@ static inline bool reshape_interrupted(struct mddev *mddev) static inline int __must_check mddev_lock(struct mddev *mddev) { - return mutex_lock_interruptible(&mddev->reconfig_mutex); + int ret; + + ret = mutex_lock_interruptible(&mddev->reconfig_mutex); + + /* MD_DELETED is set in do_md_stop with reconfig_mutex. + * So check it here. + */ + if (!ret && test_bit(MD_DELETED, &mddev->flags)) { + ret = -ENODEV; + mutex_unlock(&mddev->reconfig_mutex); + } + + return ret; } /* Sometimes we need to take the lock in a situation where * failure due to interrupts is not acceptable. + * It doesn't need to check MD_DELETED here, the owner which + * holds the lock here can't be stopped. And all paths can't + * call this function after do_md_stop. 
*/ static inline void mddev_lock_nointr(struct mddev *mddev) { @@ -682,27 +734,21 @@ static inline void mddev_lock_nointr(struct mddev *mddev) static inline int mddev_trylock(struct mddev *mddev) { - return mutex_trylock(&mddev->reconfig_mutex); -} -extern void mddev_unlock(struct mddev *mddev); - -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - if (blk_queue_io_stat(bdev->bd_disk->queue)) - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); -} + int ret; -static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) -{ - md_sync_acct(bio->bi_bdev, nr_sectors); + ret = mutex_trylock(&mddev->reconfig_mutex); + if (!ret && test_bit(MD_DELETED, &mddev->flags)) { + ret = -ENODEV; + mutex_unlock(&mddev->reconfig_mutex); + } + return ret; } +extern void mddev_unlock(struct mddev *mddev); struct md_personality { - char *name; - int level; - struct list_head list; - struct module *owner; + struct md_submodule_head head; + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that @@ -746,6 +792,9 @@ struct md_personality void *(*takeover) (struct mddev *mddev); /* Changes the consistency policy of an active array. */ int (*change_consistency_policy)(struct mddev *mddev, const char *buf); + /* convert io ranges from array to bitmap */ + void (*bitmap_sector)(struct mddev *mddev, sector_t *offset, + unsigned long *sectors); }; struct md_sysfs_entry { @@ -753,7 +802,6 @@ struct md_sysfs_entry { ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; -extern const struct attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { @@ -828,29 +876,34 @@ struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; + sector_t offset; + unsigned long sectors; + enum stat_group rw; struct bio bio_clone; }; #define THREAD_WAKEUP 0 +#define md_wakeup_thread(thread) do { \ + rcu_read_lock(); \ + __md_wakeup_thread(thread); \ + rcu_read_unlock(); \ +} while (0) + static inline void safe_put_page(struct page *p) { if (p) put_page(p); } -extern int register_md_personality(struct md_personality *p); -extern int unregister_md_personality(struct md_personality *p); -extern int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module); -extern int unregister_md_cluster_operations(void); -extern int md_setup_cluster(struct mddev *mddev, int nodes); -extern void md_cluster_stop(struct mddev *mddev); +int register_md_submodule(struct md_submodule_head *msh); +void unregister_md_submodule(struct md_submodule_head *msh); + extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); -extern void md_wakeup_thread(struct md_thread __rcu *thread); +extern void __md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern enum sync_action md_sync_action(struct mddev *mddev); @@ -868,8 +921,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); -extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int 
size, struct page *page); +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); @@ -901,7 +955,6 @@ extern void md_idle_sync_thread(struct mddev *mddev); extern void md_frozen_sync_thread(struct mddev *mddev); extern void md_unfrozen_sync_thread(struct mddev *mddev); -extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); extern void mddev_destroy_serial_pool(struct mddev *mddev, @@ -923,7 +976,6 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) } } -extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; @@ -974,7 +1026,6 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; -extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); @@ -995,6 +1046,12 @@ static inline bool mddev_is_dm(struct mddev *mddev) return !mddev->gendisk; } +static inline bool raid_is_456(struct mddev *mddev) +{ + return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 || + mddev->level == ID_RAID6; +} + static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index f4f948b0e173..dbb97a7233ab 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -2,7 +2,7 @@ config DM_PERSISTENT_DATA tristate depends on BLK_DEV_DM - select LIBCRC32C + select CRC32 select DM_BUFIO help Library providing immutable on-disk data structure support for diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 157c9bd2fed7..8f8792e55806 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c) if (c->block) unlock_ablock(c->info, c->block); - c->block = NULL; - c->ab = NULL; c->index = 0; r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le); if (r) { DMERR("dm_btree_cursor_get_value failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } else { r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab); if (r) { DMERR("get_ablock failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } } + return 0; + +out: + dm_btree_cursor_end(&c->cursor); + c->block = NULL; + c->ab = NULL; return r; } @@ -956,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin); void dm_array_cursor_end(struct dm_array_cursor *c) { - if (c->block) { + if (c->block) unlock_ablock(c->info, c->block); - dm_btree_cursor_end(&c->cursor); - } + + dm_btree_cursor_end(&c->cursor); } EXPORT_SYMBOL_GPL(dm_array_cursor_end); @@ -999,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) } count -= remaining; + c->index += (remaining - 1); r = dm_array_cursor_next(c); } while (!r); diff --git a/drivers/md/persistent-data/dm-transaction-manager.c 
b/drivers/md/persistent-data/dm-transaction-manager.c index c7ba4e6cbbc7..98c745d90f48 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -13,6 +13,7 @@ #include <linux/export.h> #include <linux/mutex.h> #include <linux/hash.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/device-mapper.h> @@ -77,7 +78,7 @@ static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm) /*----------------------------------------------------------------*/ struct shadow_info { - struct hlist_node hlist; + struct rb_node node; dm_block_t where; }; @@ -95,7 +96,7 @@ struct dm_transaction_manager { struct dm_space_map *sm; spinlock_t lock; - struct hlist_head buckets[DM_HASH_SIZE]; + struct rb_root buckets[DM_HASH_SIZE]; struct prefetch_set prefetches; }; @@ -106,14 +107,22 @@ static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) { int r = 0; unsigned int bucket = dm_hash_block(b, DM_HASH_MASK); - struct shadow_info *si; + struct rb_node **node; spin_lock(&tm->lock); - hlist_for_each_entry(si, tm->buckets + bucket, hlist) - if (si->where == b) { + node = &tm->buckets[bucket].rb_node; + while (*node) { + struct shadow_info *si = + rb_entry(*node, struct shadow_info, node); + if (b == si->where) { r = 1; break; } + if (b < si->where) + node = &si->node.rb_left; + else + node = &si->node.rb_right; + } spin_unlock(&tm->lock); return r; @@ -130,30 +139,41 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) si = kmalloc(sizeof(*si), GFP_NOIO); if (si) { + struct rb_node **node, *parent; si->where = b; bucket = dm_hash_block(b, DM_HASH_MASK); + spin_lock(&tm->lock); - hlist_add_head(&si->hlist, tm->buckets + bucket); + node = &tm->buckets[bucket].rb_node; + parent = NULL; + while (*node) { + struct shadow_info *si = + rb_entry(*node, struct shadow_info, node); + parent = *node; + if (b < si->where) + node = &si->node.rb_left; + else + node = &si->node.rb_right; + } + rb_link_node(&si->node, parent, node); + rb_insert_color(&si->node, &tm->buckets[bucket]); spin_unlock(&tm->lock); } } static void wipe_shadow_table(struct dm_transaction_manager *tm) { - struct shadow_info *si; - struct hlist_node *tmp; - struct hlist_head *bucket; - int i; + unsigned int i; spin_lock(&tm->lock); for (i = 0; i < DM_HASH_SIZE; i++) { - bucket = tm->buckets + i; - hlist_for_each_entry_safe(si, tmp, bucket, hlist) + while (!RB_EMPTY_ROOT(&tm->buckets[i])) { + struct shadow_info *si = + rb_entry(tm->buckets[i].rb_node, struct shadow_info, node); + rb_erase(&si->node, &tm->buckets[i]); kfree(si); - - INIT_HLIST_HEAD(bucket); + } } - spin_unlock(&tm->lock); } @@ -162,7 +182,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm) static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, struct dm_space_map *sm) { - int i; + unsigned int i; struct dm_transaction_manager *tm; tm = kmalloc(sizeof(*tm), GFP_KERNEL); @@ -176,7 +196,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, spin_lock_init(&tm->lock); for (i = 0; i < DM_HASH_SIZE; i++) - INIT_HLIST_HEAD(tm->buckets + i); + tm->buckets[i] = RB_ROOT; prefetch_init(&tm->prefetches); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index baaf5f8b80ae..985c377356eb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -68,7 +68,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) struct strip_zone *zone; int cnt; struct r0conf *conf = 
kzalloc(sizeof(*conf), GFP_KERNEL); - unsigned blksize = 512; + unsigned int blksize = 512; + + if (!mddev_is_dm(mddev)) + blksize = queue_logical_block_size(mddev->gendisk->queue); *private_conf = ERR_PTR(-ENOMEM); if (!conf) @@ -84,7 +87,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; - blksize = max(blksize, queue_logical_block_size( + if (mddev_is_dm(mddev)) + blksize = max(blksize, queue_logical_block_size( rdev1->bdev->bd_disk->queue)); rdev_for_each(rdev2, mddev) { @@ -382,13 +386,15 @@ static int raid0_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; + lim.chunk_sectors = mddev->chunk_sectors; + lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } @@ -404,6 +410,12 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; + if (!mddev_is_dm(mddev)) { + ret = raid0_set_limits(mddev); + if (ret) + return ret; + } + /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); @@ -412,11 +424,6 @@ static int raid0_run(struct mddev *mddev) mddev->private = conf; } conf = mddev->private; - if (!mddev_is_dm(mddev)) { - ret = raid0_set_limits(mddev); - if (ret) - return ret; - } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); @@ -463,21 +470,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { - struct bio *split = bio_split(bio, - zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, - &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, + zone->zone_end - bio->bi_iter.bi_sector, + &mddev->bio_set); + if (!bio) return; - } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + end = zone->zone_end; - } else + } else { end = bio_end_sector(bio); + } orig_end = end; if (zone != conf->strip_zone) @@ -612,17 +614,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) : sector_div(sector, chunk_sects)); if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, sectors, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + if (!bio) return true; - } - bio_chain(split, bio); - raid0_map_submit_bio(mddev, bio); - bio = split; } raid0_map_submit_bio(mddev, bio); @@ -674,7 +669,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mddev->raid_disks--; mddev->delta_disks = -1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -717,7 +712,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) 
mddev->raid_disks += mddev->delta_disks; mddev->degraded = 0; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -760,7 +755,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) mddev->delta_disks = 1 - mddev->raid_disks; mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -810,9 +805,13 @@ static void raid0_quiesce(struct mddev *mddev, int quiesce) static struct md_personality raid0_personality= { - .name = "raid0", - .level = 0, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID0, + .name = "raid0", + .owner = THIS_MODULE, + }, + .make_request = raid0_make_request, .run = raid0_run, .free = raid0_free, @@ -823,14 +822,14 @@ static struct md_personality raid0_personality= .error_handler = raid0_error, }; -static int __init raid0_init (void) +static int __init raid0_init(void) { - return register_md_personality (&raid0_personality); + return register_md_submodule(&raid0_personality.head); } -static void raid0_exit (void) +static void __exit raid0_exit(void) { - unregister_md_personality (&raid0_personality); + unregister_md_submodule(&raid0_personality.head); } module_init(raid0_init); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 4378d3250bd7..521625756128 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. */ - if (!mddev->bitmap_ops->enabled(mddev)) { + if (!md_bitmap_enabled(mddev, true)) { raid1_submit_write(bio); return true; } @@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, sector_t this_sector, int *len) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; /* no bad block overlap */ if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) @@ -283,13 +283,23 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, static inline bool raid1_should_read_first(struct mddev *mddev, sector_t this_sector, int len) { - if ((mddev->recovery_cp < this_sector + len)) + if ((mddev->resync_offset < this_sector + len)) return true; if (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, READ, this_sector, - this_sector + len)) + mddev->cluster_ops->area_resyncing(mddev, READ, this_sector, + this_sector + len)) return true; return false; } + +/* + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at anytime, before such IO is + * submitted to the underlying disks, hence don't record badblocks or retry + * in this case. 
+ */ +static inline bool raid1_should_handle_error(struct bio *bio) +{ + return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT)); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a5adf08ee174..57d50465eed1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -36,6 +36,7 @@ #include "md.h" #include "raid1.h" #include "md-bitmap.h" +#include "md-cluster.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ @@ -45,6 +46,7 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); +static void raid1_free(struct mddev *mddev, void *priv); #define RAID_1_10_NAME "raid1" #include "raid1-10.c" @@ -125,10 +127,9 @@ static inline struct r1bio *get_resync_r1bio(struct bio *bio) return get_resync_pages(bio)->raid_bio; } -static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) +static void *r1bio_pool_alloc(gfp_t gfp_flags, struct r1conf *conf) { - struct pool_info *pi = data; - int size = offsetof(struct r1bio, bios[pi->raid_disks]); + int size = offsetof(struct r1bio, bios[conf->raid_disks * 2]); /* allocate a r1bio with room for raid_disks entries in the bios array */ return kzalloc(size, gfp_flags); @@ -143,18 +144,18 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { - struct pool_info *pi = data; + struct r1conf *conf = data; struct r1bio *r1_bio; struct bio *bio; int need_pages; int j; struct resync_pages *rps; - r1_bio = r1bio_pool_alloc(gfp_flags, pi); + r1_bio = r1bio_pool_alloc(gfp_flags, conf); if (!r1_bio) return NULL; - rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages), + rps = kmalloc_array(conf->raid_disks * 2, sizeof(struct resync_pages), gfp_flags); if (!rps) goto out_free_r1bio; @@ -162,11 +163,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) /* * Allocate bios : 1 for reading, n-1 for writing */ - for (j = pi->raid_disks ; j-- ; ) { + for (j = conf->raid_disks * 2; j-- ; ) { bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* @@ -175,11 +176,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) * If this is a user-requested check/repair, allocate * RESYNC_PAGES for each bio. 
*/ - if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) - need_pages = pi->raid_disks; + if (test_bit(MD_RECOVERY_REQUESTED, &conf->mddev->recovery)) + need_pages = conf->raid_disks * 2; else need_pages = 1; - for (j = 0; j < pi->raid_disks; j++) { + for (j = 0; j < conf->raid_disks * 2; j++) { struct resync_pages *rp = &rps[j]; bio = r1_bio->bios[j]; @@ -205,7 +206,7 @@ out_free_pages: resync_free_pages(&rps[j]); out_free_bio: - while (++j < pi->raid_disks) { + while (++j < conf->raid_disks * 2) { bio_uninit(r1_bio->bios[j]); kfree(r1_bio->bios[j]); } @@ -218,12 +219,12 @@ out_free_r1bio: static void r1buf_pool_free(void *__r1_bio, void *data) { - struct pool_info *pi = data; + struct r1conf *conf = data; int i; struct r1bio *r1bio = __r1_bio; struct resync_pages *rp = NULL; - for (i = pi->raid_disks; i--; ) { + for (i = conf->raid_disks * 2; i--; ) { rp = get_resync_pages(r1bio->bios[i]); resync_free_pages(rp); bio_uninit(r1bio->bios[i]); @@ -253,7 +254,7 @@ static void free_r1bio(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; put_all_bios(conf, r1_bio); - mempool_free(r1_bio, &conf->r1bio_pool); + mempool_free(r1_bio, conf->r1bio_pool); } static void put_buf(struct r1bio *r1_bio) @@ -371,14 +372,16 @@ static void raid1_end_read_request(struct bio *bio) */ update_head_pos(r1_bio->read_disk, r1_bio); - if (uptodate) + if (uptodate) { set_bit(R1BIO_Uptodate, &r1_bio->state); - else if (test_bit(FailFast, &rdev->flags) && - test_bit(R1BIO_FailFast, &r1_bio->state)) + } else if (test_bit(FailFast, &rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) { /* This was a fail-fast read so we definitely * want to retry */ ; - else { + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; + } else { /* If all other devices have failed, we want to return * the error upwards rather than fail the last device. * Here we redefine "uptodate" to mean "Don't want to retry" @@ -420,10 +423,8 @@ static void close_write(struct r1bio *r1_bio) r1_bio->behind_master_bio = NULL; } - /* clear the bitmap if all writes complete successfully */ - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->end_behind_write(mddev); md_write_end(mddev); } @@ -451,16 +452,15 @@ static void raid1_end_write_request(struct bio *bio) struct bio *to_put = NULL; int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; - bool discard_error; sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); /* * 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -480,8 +480,6 @@ static void raid1_end_write_request(struct bio *bio) if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); else { - /* Fail the request */ - set_bit(R1BIO_Degraded, &r1_bio->state); /* Finished with this branch */ r1_bio->bios[mirror] = NULL; to_put = bio; @@ -513,7 +511,7 @@ static void raid1_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. 
*/ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && - !discard_error) { + !ignore_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } @@ -1227,7 +1225,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, int i = 0; struct bio *behind_bio = NULL; - behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO, + behind_bio = bio_alloc_bioset(NULL, vcnt, bio->bi_opf, GFP_NOIO, &r1_bio->mddev->bio_set); /* discard op, we don't support writezero/writesame yet */ @@ -1306,9 +1304,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio) struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO); - /* Ensure no bio records IO_BLOCKED */ - memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0])); + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + memset(r1_bio, 0, offsetof(struct r1bio, bios[conf->raid_disks * 2])); init_r1bio(r1_bio, mddev, bio); return r1_bio; } @@ -1319,10 +1316,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct raid1_info *mirror; struct bio *read_bio; - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; - int rdisk, error; + int rdisk; bool r1bio_existed = !!r1_bio; /* @@ -1371,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags)) { + if (test_bit(WriteMostly, &mirror->rdev->flags) && + md_bitmap_enabled(mddev, false)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' @@ -1381,16 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1402,13 +1395,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, &mddev->bio_set); - + read_bio->bi_opf &= ~REQ_NOWAIT; r1_bio->bios[rdisk] = read_bio; read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &mirror->rdev->flags) && test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -1419,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, err_handle: atomic_dec(&mirror->rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -1458,12 +1448,36 @@ retry: return true; } +static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio, + struct bio *bio) +{ + unsigned long max_write_behind = mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + + /* behind write rely on bitmap, see bitmap_operations */ + if (!md_bitmap_enabled(mddev, false)) + return; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + /* Don't do behind IO if reader is 
waiting, or there are too many. */ + if (!stats.behind_wait && stats.behind_writes < max_write_behind) + alloc_behind_master_bio(r1_bio, bio); + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->start_behind_write(mddev); + +} + static void raid1_write_request(struct mddev *mddev, struct bio *bio, int max_write_sectors) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks, k, error; + int i, disks, k; unsigned long flags; int first_clone; int max_sectors; @@ -1471,7 +1485,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bool is_discard = (bio_op(bio) == REQ_OP_DISCARD); if (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, WRITE, + mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) { DEFINE_WAIT(w); @@ -1482,7 +1496,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; @@ -1535,16 +1549,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, write_behind = true; r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { - if (i < conf->raid_disks) - set_bit(R1BIO_Degraded, &r1_bio->state); + if (!rdev || test_bit(Faulty, &rdev->flags)) continue; - } atomic_inc(&rdev->nr_pending); if (test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, @@ -1558,20 +1569,22 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ max_sectors = bad_sectors; rdev_dec_pending(rdev, mddev); - /* We don't set R1BIO_Degraded as that - * only applies if the disk is - * missing, so it might be re-added, - * and we want to know to recover this - * chunk. - * In this case the device is here, - * and the fact that this chunk is not - * in-sync is recorded in the bad - * block log - */ continue; } if (is_bad) { - int good_sectors = first_bad - r1_bio->sector; + int good_sectors; + + /* + * We cannot atomically write this, so just + * error in that case. It could be possible to + * atomically write other mirrors, but the + * complexity of supporting that is not worth + * the benefit. + */ + if (bio->bi_opf & REQ_ATOMIC) + goto err_handle; + + good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) max_sectors = good_sectors; } @@ -1589,16 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1617,23 +1627,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { - unsigned long max_write_behind = - mddev->bitmap_info.max_write_behind; - struct md_bitmap_stats stats; - int err; - - /* do behind I/O ? 
- * Not if there are too many, or cannot - * allocate memory, or a reader on WriteMostly - * is waiting for behind writes to flush */ - err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); - if (!err && write_behind && !stats.behind_wait && - stats.behind_writes < max_write_behind) - alloc_behind_master_bio(r1_bio, bio); - - mddev->bitmap_ops->startwrite( - mddev, r1_bio->sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); + if (write_behind) + raid1_start_write_behind(mddev, r1_bio, bio); first_clone = 0; } @@ -1653,11 +1648,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, wait_for_serialization(rdev, r1_bio); } + mbio->bi_opf &= ~REQ_NOWAIT; r1_bio->bios[i] = mbio; mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); mbio->bi_end_io = raid1_end_write_request; - mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); if (test_bit(FailFast, &rdev->flags) && !test_bit(WriteMostly, &rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -1689,8 +1684,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -2063,7 +2056,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); + md_bitmap_end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2206,14 +2199,9 @@ static int fix_sync_read_error(struct r1bio *r1_bio) if (!rdev_set_badblocks(rdev, sect, s, 0)) abort = 1; } - if (abort) { - conf->recovery_disabled = - mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); + if (abort) return 0; - } + /* Try next page */ sectors -= s; sect += s; @@ -2352,10 +2340,21 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) int disks = conf->raid_disks * 2; struct bio *wbio; - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - /* ouch - failed to read all of that. */ - if (!fix_sync_read_error(r1_bio)) + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* + * ouch - failed to read all of that. + * No need to fix read error for check/repair + * because all member disks are read. 
+ */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) || + !fix_sync_read_error(r1_bio)) { + conf->recovery_disabled = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_done_sync(mddev, r1_bio->sectors, 0); + put_buf(r1_bio); return; + } + } if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) process_checks(r1_bio); @@ -2382,7 +2381,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); submit_bio_noacct(wbio); } @@ -2489,7 +2487,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) } } -static int narrow_write_error(struct r1bio *r1_bio, int i) +static bool narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2510,10 +2508,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) sector_t sector; int sectors; int sect_to_write = r1_bio->sectors; - int ok = 1; + bool ok = true; if (rdev->badblocks.shift < 0) - return 0; + return false; block_sectors = roundup(1 << rdev->badblocks.shift, bdev_logical_block_size(rdev->bdev) >> 9); @@ -2599,12 +2597,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * errors. */ fail = true; - if (!narrow_write_error(r1_bio, m)) { + if (!narrow_write_error(r1_bio, m)) md_error(conf->mddev, conf->mirrors[m].rdev); /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } @@ -2695,8 +2691,6 @@ static void raid1d(struct md_thread *thread) list_del(&r1_bio->retry_list); idx = sector_to_idx(r1_bio->sector); atomic_dec(&conf->nr_queued[idx]); - if (mddev->degraded) - set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) close_write(r1_bio); raid_end_bio_io(r1_bio); @@ -2750,7 +2744,7 @@ static int init_resync(struct r1conf *conf) BUG_ON(mempool_initialized(&conf->r1buf_pool)); return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc, - r1buf_pool_free, conf->poolinfo); + r1buf_pool_free, conf); } static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) @@ -2760,7 +2754,7 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) struct bio *bio; int i; - for (i = conf->poolinfo->raid_disks; i--; ) { + for (i = conf->raid_disks * 2; i--; ) { bio = r1bio->bios[i]; rps = bio->bi_private; bio_reset(bio, NULL, 0); @@ -2809,12 +2803,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2825,7 +2820,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; @@ -2834,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can 
skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2851,10 +2846,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* we are incrementing sector_nr below. To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ - - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, - mddev_is_clustered(mddev) && - (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -2893,7 +2889,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { /* may need to read from here */ sector_t first_bad = MaxSector; - int bad_sectors; + sector_t bad_sectors; if (is_badblock(rdev, sector_nr, good_sectors, &first_bad, &bad_sectors)) { @@ -3009,8 +3005,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, still_degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3045,9 +3041,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, conf->cluster_sync_low = mddev->curr_resync_completed; conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; /* Send resync message */ - md_cluster_ops->resync_info_update(mddev, - conf->cluster_sync_low, - conf->cluster_sync_high); + mddev->cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); } /* For a user-requested sync, we read all readable devices and do a @@ -3059,7 +3055,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { read_targets--; - md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; submit_bio_noacct(bio); @@ -3068,7 +3063,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; submit_bio_noacct(bio); @@ -3090,6 +3084,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) int i; struct raid1_info *disk; struct md_rdev *rdev; + size_t r1bio_size; int err = -ENOMEM; conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); @@ -3126,21 +3121,15 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->tmppage) goto abort; - conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); - if (!conf->poolinfo) - goto abort; - conf->poolinfo->raid_disks = mddev->raid_disks * 2; - err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, - rbio_pool_free, conf->poolinfo); - if (err) + r1bio_size = offsetof(struct r1bio, bios[mddev->raid_disks * 2]); + conf->r1bio_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, r1bio_size); + if (!conf->r1bio_pool) goto 
abort; err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); if (err) goto abort; - conf->poolinfo->mddev = mddev; - err = -EINVAL; spin_lock_init(&conf->device_lock); conf->raid_disks = mddev->raid_disks; @@ -3203,10 +3192,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) abort: if (conf) { - mempool_exit(&conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); - kfree(conf->poolinfo); kfree(conf->nr_pending); kfree(conf->nr_waiting); kfree(conf->nr_queued); @@ -3224,11 +3212,12 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; + lim.logical_block_size = mddev->logical_block_size; + lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } @@ -3264,8 +3253,11 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); - if (ret) + if (ret) { + if (!mddev->private) + raid1_free(mddev, conf); return ret; + } } mddev->degraded = 0; @@ -3279,13 +3271,15 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); + if (!mddev->private) + raid1_free(mddev, conf); return -EINVAL; } if (conf->raid_disks - mddev->degraded == 1) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid1:%s: active with %d out of %d mirrors\n", @@ -3312,10 +3306,9 @@ static void raid1_free(struct mddev *mddev, void *priv) { struct r1conf *conf = priv; - mempool_exit(&conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); - kfree(conf->poolinfo); kfree(conf->nr_pending); kfree(conf->nr_waiting); kfree(conf->nr_queued); @@ -3334,20 +3327,22 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); - int ret; if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, newsize, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; @@ -3368,17 +3363,13 @@ static int raid1_reshape(struct mddev *mddev) * At the same time, we "pack" the devices so that all the missing * devices have the higher raid_disk numbers. 
*/ - mempool_t newpool, oldpool; - struct pool_info *newpoolinfo; + mempool_t *newpool, *oldpool; + size_t new_r1bio_size; struct raid1_info *newmirrors; struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; int d, d2; - int ret; - - memset(&newpool, 0, sizeof(newpool)); - memset(&oldpool, 0, sizeof(oldpool)); /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_sectors != mddev->new_chunk_sectors || @@ -3404,24 +3395,16 @@ static int raid1_reshape(struct mddev *mddev) return -EBUSY; } - newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); - if (!newpoolinfo) + new_r1bio_size = offsetof(struct r1bio, bios[raid_disks * 2]); + newpool = mempool_create_kmalloc_pool(NR_RAID_BIOS, new_r1bio_size); + if (!newpool) { return -ENOMEM; - newpoolinfo->mddev = mddev; - newpoolinfo->raid_disks = raid_disks * 2; - - ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, - rbio_pool_free, newpoolinfo); - if (ret) { - kfree(newpoolinfo); - return ret; } newmirrors = kzalloc(array3_size(sizeof(struct raid1_info), raid_disks, 2), GFP_KERNEL); if (!newmirrors) { - kfree(newpoolinfo); - mempool_exit(&newpool); + mempool_destroy(newpool); return -ENOMEM; } @@ -3446,8 +3429,6 @@ static int raid1_reshape(struct mddev *mddev) } kfree(conf->mirrors); conf->mirrors = newmirrors; - kfree(conf->poolinfo); - conf->poolinfo = newpoolinfo; spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); @@ -3461,7 +3442,7 @@ static int raid1_reshape(struct mddev *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - mempool_exit(&oldpool); + mempool_destroy(oldpool); return 0; } @@ -3499,9 +3480,13 @@ static void *raid1_takeover(struct mddev *mddev) static struct md_personality raid1_personality = { - .name = "raid1", - .level = 1, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID1, + .name = "raid1", + .owner = THIS_MODULE, + }, + .make_request = raid1_make_request, .run = raid1_run, .free = raid1_free, @@ -3518,18 +3503,18 @@ static struct md_personality raid1_personality = .takeover = raid1_takeover, }; -static int __init raid_init(void) +static int __init raid1_init(void) { - return register_md_personality(&raid1_personality); + return register_md_submodule(&raid1_personality.head); } -static void raid_exit(void) +static void __exit raid1_exit(void) { - unregister_md_personality(&raid1_personality); + unregister_md_submodule(&raid1_personality.head); } -module_init(raid_init); -module_exit(raid_exit); +module_init(raid1_init); +module_exit(raid1_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5300cbaa58a4..2ebe35aaa534 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -49,22 +49,6 @@ struct raid1_info { sector_t seq_start; }; -/* - * memory pools need a pointer to the mddev, so they can force an unplug - * when memory is tight, and a count of the number of drives that the - * pool was allocated for, so they know how much to allocate and free. - * mddev->raid_disks cannot be used, as it can change while a pool is active - * These two datums are stored in a kmalloced struct. - * The 'raid_disks' here is twice the raid_disks in r1conf. - * This allows space for each 'real' device can have a replacement in the - * second half of the array. 
- */ - -struct pool_info { - struct mddev *mddev; - int raid_disks; -}; - struct r1conf { struct mddev *mddev; struct raid1_info *mirrors; /* twice 'raid_disks' to @@ -114,11 +98,7 @@ struct r1conf { */ int recovery_disabled; - /* poolinfo contains information about the content of the - * mempools - it changes when the array grows or shrinks - */ - struct pool_info *poolinfo; - mempool_t r1bio_pool; + mempool_t *r1bio_pool; mempool_t r1buf_pool; struct bio_set bio_split; @@ -188,7 +168,6 @@ struct r1bio { enum r1bio_state { R1BIO_Uptodate, R1BIO_IsSync, - R1BIO_Degraded, R1BIO_BehindIO, /* Set ReadError on bios that experience a readerror so that * raid1d knows what to do with them. @@ -199,7 +178,9 @@ enum r1bio_state { * any write was successful. Otherwise we call when * any write-behind write succeeds, otherwise we call * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... + * + * And for bio_split errors, record that bi_end_io was called + * with this flag... */ R1BIO_Returned, /* If a write for this request means we can clear some diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 18989231791a..84be4cc7e873 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -24,6 +24,7 @@ #include "raid10.h" #include "raid0.h" #include "md-bitmap.h" +#include "md-cluster.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -162,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* @@ -321,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_status = BLK_STS_IOERR; + if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) { + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } - bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. @@ -398,6 +401,8 @@ static void raid10_end_read_request(struct bio *bio) * wait for the 'master' bio. 
*/ set_bit(R10BIO_Uptodate, &r10_bio->state); + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; } else { /* If all other devices that store this block have * failed, we want to return the error upwards rather @@ -428,10 +433,6 @@ static void close_write(struct r10bio *r10_bio) { struct mddev *mddev = r10_bio->mddev; - /* clear the bitmap if all writes complete successfully */ - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - false); md_write_end(mddev); } @@ -459,9 +460,8 @@ static void raid10_end_write_request(struct bio *bio) int slot, repl; struct md_rdev *rdev = NULL; struct bio *to_put = NULL; - bool discard_error; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); @@ -475,7 +475,7 @@ static void raid10_end_write_request(struct bio *bio) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -501,7 +501,6 @@ static void raid10_end_write_request(struct bio *bio) set_bit(R10BIO_WriteError, &r10_bio->state); else { /* Fail the request */ - set_bit(R10BIO_Degraded, &r10_bio->state); r10_bio->devs[slot].bio = NULL; to_put = bio; dec_rdev = 1; @@ -531,7 +530,7 @@ static void raid10_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, r10_bio->sectors) && - !discard_error) { + !ignore_error) { bio_put(bio); if (repl) r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; @@ -752,7 +751,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, for (slot = 0; slot < conf->copies ; slot++) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; sector_t dev_sector; unsigned int pending; bool nonrot; @@ -1151,15 +1150,12 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, { struct r10conf *conf = mddev->private; struct bio *read_bio; - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; struct md_rdev *rdev; char b[BDEVNAME_SIZE]; int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - int error; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1187,8 +1183,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } } - if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) + if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) { + raid_end_bio_io(r10_bio); return; + } + rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { if (err_rdev) { @@ -1205,17 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev->bdev, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; } @@ -1226,6 +1223,7 @@ static void 
raid10_read_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); + read_bio->bi_opf &= ~REQ_NOWAIT; r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1233,7 +1231,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &rdev->flags) && test_bit(R10BIO_FailFast, &r10_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -1243,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, return; err_handle: atomic_dec(&rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1252,9 +1247,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, struct bio *bio, bool replacement, int n_copy) { - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; - const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; unsigned long flags; struct r10conf *conf = mddev->private; struct md_rdev *rdev; @@ -1265,6 +1257,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, conf->mirrors[devnum].rdev; mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); + mbio->bi_opf &= ~REQ_NOWAIT; if (replacement) r10_bio->devs[n_copy].repl_bio = mbio; else @@ -1273,7 +1266,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); mbio->bi_end_io = raid10_end_write_request; - mbio->bi_opf = op | do_sync | do_fua; if (!replacement && test_bit(FailFast, &conf->mirrors[devnum].rdev->flags) && enough(conf, devnum)) @@ -1356,12 +1348,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i, k; sector_t sectors; int max_sectors; - int error; if ((mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) { + mddev->cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { DEFINE_WAIT(w); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { @@ -1371,7 +1362,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; schedule(); @@ -1380,8 +1371,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, } sectors = r10_bio->sectors; - if (!regular_request_wait(mddev, conf, bio, sectors)) + if (!regular_request_wait(mddev, conf, bio, sectors)) { + raid_end_bio_io(r10_bio); return; + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && (mddev->reshape_backwards ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && @@ -1437,14 +1431,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev && !rrdev) { - set_bit(R10BIO_Degraded, &r10_bio->state); + if (!rdev && !rrdev) continue; - } if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, dev_sector, max_sectors, @@ -1457,18 +1449,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * to other devices yet */ max_sectors = bad_sectors; - /* We don't set R10BIO_Degraded as that - * only applies if the disk is missing, - * so it might be re-added, and we want to - * know to recover this chunk. - * In this case the device is here, and the - * fact that this chunk is not in-sync is - * recorded in the bad block log. - */ continue; } if (is_bad) { - int good_sectors = first_bad - dev_sector; + int good_sectors; + + /* + * We cannot atomically write this, so just + * error in that case. It could be possible to + * atomically write other mirrors, but the + * complexity of supporting that is not worth + * the benefit. + */ + if (bio->bi_opf & REQ_ATOMIC) + goto err_handle; + + good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) max_sectors = good_sectors; } @@ -1487,25 +1483,21 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->sectors = max_sectors; if (r10_bio->sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, r10_bio->sectors, - GFP_NOIO, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, r10_bio->sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; } md_account_bio(mddev, &bio); r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); - mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors, - false); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) @@ -1531,8 +1523,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1633,11 +1623,10 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return -EAGAIN; - if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { + if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { bio_wouldblock_error(bio); return 0; } - wait_barrier(conf, false); /* * Check reshape again to avoid reshape happens after checking @@ -1680,7 +1669,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the fist split part */ submit_bio_noacct(split); @@ -1695,7 +1686,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the second split part */ submit_bio_noacct(bio); @@ -1745,6 +1738,7 @@ retry_discard: * The discard bio returns only first r10bio finishes */ if (first_copy) { + md_account_bio(mddev, 
&bio); r10_bio->master_bio = bio; set_bit(R10BIO_Discard, &r10_bio->state); first_copy = false; @@ -2117,7 +2111,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int last = conf->geo.raid_disks - 1; struct raid10_info *p; - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ @@ -2435,7 +2429,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) tbio->bi_opf |= MD_FAILFAST; @@ -2447,18 +2440,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * that are active */ for (i = 0; i < conf->copies; i++) { - int d; - tbio = r10_bio->devs[i].repl_bio; if (!tbio || !tbio->bi_end_io) continue; if (r10_bio->devs[i].bio->bi_end_io != end_sync_write && r10_bio->devs[i].bio != fbio) bio_copy_data(tbio, fbio); - d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].replacement->bdev, - bio_sectors(tbio)); submit_bio_noacct(tbio); } @@ -2592,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[1].devnum; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); submit_bio_noacct(wbio); } if (wbio2) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); - md_sync_acct(conf->mirrors[d].replacement->bdev, - bio_sectors(wbio2)); submit_bio_noacct(wbio2); } } @@ -2788,7 +2773,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static int narrow_write_error(struct r10bio *r10_bio, int i) +static bool narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; struct mddev *mddev = r10_bio->mddev; @@ -2809,10 +2794,10 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) sector_t sector; int sectors; int sect_to_write = r10_bio->sectors; - int ok = 1; + bool ok = true; if (rdev->badblocks.shift < 0) - return 0; + return false; block_sectors = roundup(1 << rdev->badblocks.shift, bdev_logical_block_size(rdev->bdev) >> 9); @@ -2951,11 +2936,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && bio->bi_status) { fail = true; - if (!narrow_write_error(r10_bio, m)) { + if (!narrow_write_error(r10_bio, m)) md_error(conf->mddev, rdev); - set_bit(R10BIO_Degraded, - &r10_bio->state); - } rdev_dec_pending(rdev, conf->mddev); } bio = r10_bio->devs[m].repl_bio; @@ -3014,8 +2996,6 @@ static void raid10d(struct md_thread *thread) r10_bio = list_first_entry(&tmp, struct r10bio, retry_list); list_del(&r10_bio->retry_list); - if (mddev->degraded) - set_bit(R10BIO_Degraded, &r10_bio->state); if (test_bit(R10BIO_WriteError, &r10_bio->state)) @@ -3199,7 +3179,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * of a clean array, like RAID1 does. 
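narrow_write_error() has always been used as a predicate, so the conversion above only changes its return type to bool and makes the early bad-blocks check return false. Condensed from the handle_write_completed() hunk in this section, the resulting call site is:

        /* A write failed on this mirror: try to narrow the failure down to
         * the offending bad blocks; if even that fails, fail the rdev.
         * R10BIO_Degraded is no longer set here, the bad block log (or the
         * Faulty rdev) already records what still needs recovery. */
        if (bio != NULL && bio->bi_status) {
                fail = true;
                if (!narrow_write_error(r10_bio, m))
                        md_error(conf->mddev, rdev);
                rdev_dec_pending(rdev, conf->mddev);
        }
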
*/ if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && mddev->reshape_position == MaxSector && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && @@ -3235,15 +3215,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - mddev->bitmap_ops->end_sync(mddev, - mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - mddev->bitmap_ops->end_sync(mddev, sect, - &sync_blocks); + md_bitmap_end_sync(mddev, sect, &sync_blocks); } } else { /* completed sync */ @@ -3263,7 +3241,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3365,9 +3344,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, - true); + must_sync = md_bitmap_start_sync(mddev, sect, + &sync_blocks, true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3410,9 +3388,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, still_degraded); - + md_bitmap_start_sync(mddev, sect, &sync_blocks, + still_degraded); any_working = 0; for (j=0; j<conf->copies;j++) { int k; @@ -3420,7 +3397,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t from_addr, to_addr; struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t sector, first_bad; - int bad_sectors; + sector_t bad_sectors; if (!rdev || !test_bit(In_sync, &rdev->flags)) continue; @@ -3584,13 +3561,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. 
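The resync bookkeeping above now goes through md_bitmap_start_sync()/md_bitmap_end_sync() wrappers, while the remaining direct bitmap_ops calls are guarded by md_bitmap_enabled(); the helper definitions are not part of these hunks, so only the calling pattern visible here is shown:

        /* end-of-resync bookkeeping with an optional bitmap: the wrappers
         * are called without a guard in these hunks, raw bitmap_ops
         * dereferences are guarded explicitly */
        md_bitmap_end_sync(mddev, mddev->curr_resync, &sync_blocks);

        if (md_bitmap_enabled(mddev, false))
                mddev->bitmap_ops->close_sync(mddev);
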
*/ - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, - mddev->degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ @@ -3616,7 +3593,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; - int bad_sectors; + sector_t bad_sectors; struct md_rdev *rdev; if (r10_bio->devs[i].repl_bio) @@ -3723,7 +3700,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, conf->cluster_sync_low = mddev->curr_resync_completed; raid10_set_cluster_sync_high(conf); /* Send resync message */ - md_cluster_ops->resync_info_update(mddev, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -3756,7 +3733,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } if (broadcast_msg) { raid10_set_cluster_sync_high(conf); - md_cluster_ops->resync_info_update(mddev, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -3771,7 +3748,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { - md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; submit_bio_noacct(bio); } @@ -4023,13 +3999,15 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; + lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); + lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } @@ -4160,7 +4138,7 @@ static int raid10_run(struct mddev *mddev) disk->recovery_disabled = mddev->recovery_disabled - 1; } - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid10:%s: active with %d out of %d devices\n", @@ -4240,7 +4218,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; - int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4254,14 +4231,17 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, size, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, size, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > oldsize) { - mddev->recovery_cp = oldsize; + mddev->resync_offset > oldsize) { + mddev->resync_offset = oldsize; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } calc_sectors(conf, 
sectors); @@ -4290,7 +4270,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) mddev->delta_disks = mddev->raid_disks; mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev->dev_sectors = size; conf = setup_conf(mddev); @@ -4522,8 +4502,9 @@ static int raid10_start_reshape(struct mddev *mddev) oldsize = raid10_size(mddev, 0, 0); newsize = raid10_size(mddev, 0, conf->geo.raid_disks); - if (!mddev_is_clustered(mddev)) { - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + if (!mddev_is_clustered(mddev) && + md_bitmap_enabled(mddev, false)) { + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; else @@ -4545,13 +4526,14 @@ static int raid10_start_reshape(struct mddev *mddev) MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + /* cluster can't be setup without bitmap */ + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; - ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); + ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - mddev->bitmap_ops->resize(mddev, oldsize, 0, false); + mddev->bitmap_ops->resize(mddev, oldsize, 0); goto abort; } } @@ -4840,7 +4822,7 @@ read_more: conf->cluster_sync_low = sb_reshape_pos; } - md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -4895,7 +4877,6 @@ read_more: r10_bio->sectors = nr_sectors; /* Now submit the read */ - md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; submit_bio_noacct(read_bio); @@ -4955,7 +4936,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; atomic_inc(&rdev->nr_pending); - md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; submit_bio_noacct(b); @@ -4985,7 +4965,7 @@ static void raid10_update_reshape_pos(struct mddev *mddev) struct r10conf *conf = mddev->private; sector_t lo, hi; - md_cluster_ops->resync_info_get(mddev, &lo, &hi); + mddev->cluster_ops->resync_info_get(mddev, &lo, &hi); if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) || mddev->reshape_position == MaxSector) conf->reshape_progress = mddev->reshape_position; @@ -5104,8 +5084,8 @@ static void raid10_finish_reshape(struct mddev *mddev) return; if (mddev->delta_disks > 0) { - if (mddev->recovery_cp > mddev->resync_max_sectors) { - mddev->recovery_cp = mddev->resync_max_sectors; + if (mddev->resync_offset > mddev->resync_max_sectors) { + mddev->resync_offset = mddev->resync_max_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->resync_max_sectors = mddev->array_sectors; @@ -5131,9 +5111,13 @@ static void raid10_finish_reshape(struct mddev *mddev) static struct md_personality raid10_personality = { - .name = "raid10", - .level = 10, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID10, + .name = "raid10", + .owner = THIS_MODULE, + }, + .make_request = raid10_make_request, .run = raid10_run, .free = raid10_free, @@ -5153,18 +5137,18 @@ static struct md_personality raid10_personality = .update_reshape_pos = raid10_update_reshape_pos, }; -static int __init raid_init(void) +static int __init raid10_init(void) { - return register_md_personality(&raid10_personality); + return 
register_md_submodule(&raid10_personality.head); } -static void raid_exit(void) +static void __exit raid10_exit(void) { - unregister_md_personality(&raid10_personality); + unregister_md_submodule(&raid10_personality.head); } -module_init(raid_init); -module_exit(raid_exit); +module_init(raid10_init); +module_exit(raid10_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 2e75e88d0802..da00a55f7a55 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -161,11 +161,12 @@ enum r10bio_state { R10BIO_IsSync, R10BIO_IsRecover, R10BIO_IsReshape, - R10BIO_Degraded, /* Set ReadError on bios that experience a read error * so that raid10d knows what to do with them. */ R10BIO_ReadError, +/* For bio_split errors, record that bi_end_io was called. */ + R10BIO_Returned, /* If a write for this request means we can clear some * known-bad-block records, we set this flag. */ diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b4f7b79fd187..e29e69335c69 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -313,10 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - false); } } } @@ -718,7 +714,7 @@ static void r5l_submit_current_io(struct r5l_log *log) block = page_address(io->meta_page); block->meta_size = cpu_to_le32(io->meta_offset); - crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); + crc = crc32c(log->uuid_checksum, block, PAGE_SIZE); block->checksum = cpu_to_le32(crc); log->current_io = NULL; @@ -1023,10 +1019,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) /* checksum is already calculated in last run */ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) continue; - addr = kmap_atomic(sh->dev[i].page); - sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, - addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_local(addr); } parity_pages = 1 + !!(sh->qd_idx >= 0); data_pages = write_disks - parity_pages; @@ -1745,7 +1741,7 @@ static int r5l_recovery_read_meta_block(struct r5l_log *log, le64_to_cpu(mb->position) != ctx->pos) return -EINVAL; - crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE); if (stored_crc != crc) return -EINVAL; @@ -1784,8 +1780,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, return -ENOMEM; r5l_recovery_create_empty_meta_block(log, page, pos, seq); mb = page_address(page); - mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, - mb, PAGE_SIZE)); + mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, mb, PAGE_SIZE)); if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false)) { __free_page(page); @@ -1979,9 +1974,9 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log, u32 checksum; r5l_recovery_read_page(log, ctx, page, log_offset); - addr = kmap_atomic(page); - checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(page); + checksum = crc32c(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_local(addr); return 
(le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; } @@ -2381,11 +2376,11 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, payload->size = cpu_to_le32(BLOCK_SECTORS); payload->location = cpu_to_le64( raid5_compute_blocknr(sh, i, 0)); - addr = kmap_atomic(dev->page); + addr = kmap_local_page(dev->page); payload->checksum[0] = cpu_to_le32( - crc32c_le(log->uuid_checksum, addr, - PAGE_SIZE)); - kunmap_atomic(addr); + crc32c(log->uuid_checksum, addr, + PAGE_SIZE)); + kunmap_local(addr); sync_page_io(log->rdev, write_pos, PAGE_SIZE, dev->page, REQ_OP_WRITE, false); write_pos = r5l_ring_add(log, write_pos, @@ -2396,8 +2391,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, } } mb->meta_size = cpu_to_le32(offset); - mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, - mb, PAGE_SIZE)); + mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, + mb, PAGE_SIZE)); sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false); sh->log_start = ctx->pos; @@ -2888,10 +2883,10 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) continue; - addr = kmap_atomic(sh->dev[i].page); - sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, - addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_local(addr); pages++; } WARN_ON(pages == 0); @@ -2973,7 +2968,7 @@ static int r5l_load_log(struct r5l_log *log) } stored_crc = le32_to_cpu(mb->checksum); mb->checksum = 0; - expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + expected_crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE); if (stored_crc != expected_crc) { create_super = true; goto create; @@ -3081,8 +3076,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) return -ENOMEM; log->rdev = rdev; log->need_cache_flush = bdev_write_cache(rdev->bdev); - log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, - sizeof(rdev->mddev->uuid)); + log->uuid_checksum = crc32c(~0, rdev->mddev->uuid, + sizeof(rdev->mddev->uuid)); mutex_init(&log->io_mutex); @@ -3109,7 +3104,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) goto out_mempool; spin_lock_init(&log->tree_lock); - INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT); thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 37c4da5311ca..56b234683ee6 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -346,9 +346,9 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) { le32_add_cpu(&e->pp_size, PAGE_SIZE); io->pp_size += PAGE_SIZE; - e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum), - page_address(sh->ppl_page), - PAGE_SIZE)); + e->checksum = cpu_to_le32(crc32c(le32_to_cpu(e->checksum), + page_address(sh->ppl_page), + PAGE_SIZE)); } list_add_tail(&sh->log_list, &io->stripe_list); @@ -454,7 +454,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) } pplhdr->entries_count = cpu_to_le32(io->entries_count); - pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PPL_HEADER_SIZE)); /* Rewind the buffer if current PPL is larger then remaining space */ if (log->use_multippl && @@ -998,7 +998,7 @@ static int ppl_recover(struct 
ppl_log *log, struct ppl_header *pplhdr, goto out; } - crc = crc32c_le(crc, page_address(page), s); + crc = crc32c(crc, page_address(page), s); pp_size -= s; sector += s >> 9; @@ -1052,7 +1052,7 @@ static int ppl_write_empty_header(struct ppl_log *log) log->rdev->ppl.size, GFP_NOIO, 0); memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); - pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); + pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PAGE_SIZE)); if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC | @@ -1106,7 +1106,7 @@ static int ppl_load_distributed(struct ppl_log *log) /* check header validity */ crc_stored = le32_to_cpu(pplhdr->checksum); pplhdr->checksum = 0; - crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + crc = ~crc32c(~0, pplhdr, PAGE_SIZE); if (crc_stored != crc) { pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n", @@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log) le64_to_cpu(pplhdr->generation)); /* attempt to recover from log if we are starting a dirty array */ - if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector) ret = ppl_recover(log, pplhdr, pplhdr_offset); /* write empty header if we are starting the array */ @@ -1390,7 +1390,7 @@ int ppl_init_log(struct r5conf *conf) spin_lock_init(&ppl_conf->no_mem_stripes_lock); if (!mddev->external) { - ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); + ppl_conf->signature = ~crc32c(~0, mddev->uuid, sizeof(mddev->uuid)); ppl_conf->block_size = 512; } else { ppl_conf->block_size = @@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && mddev->recovery_cp == 0 && + } else if (!mddev->pers && mddev->resync_offset == 0 && ppl_conf->recovered_entries > 0 && ppl_conf->mismatch_count == 0) { /* * If we are starting a dirty array and the recovery succeeds * without any issues, set the array as clean. 
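The checksum and mapping changes through raid5-cache.c and raid5-ppl.c are mechanical: crc32c_le() becomes the plain crc32c() library call (the same little-endian CRC32C), and kmap_atomic()/kunmap_atomic() pairs become kmap_local_page()/kunmap_local(), which avoid the implicit preemption and pagefault disabling of the atomic variant. The recurring pattern after the change:

        /* checksum one cached data page with the per-array uuid seed */
        void *addr = kmap_local_page(sh->dev[i].page);

        sh->dev[i].log_checksum = crc32c(log->uuid_checksum, addr, PAGE_SIZE);
        kunmap_local(addr);
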
*/ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } else if (mddev->pers && ppl_conf->mismatch_count > 0) { /* no mismatch allowed when enabling PPL for a running array */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f09e7677ee9f..e57ce3295292 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -906,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh) if (raid5_has_log(conf) || raid5_has_ppl(conf)) return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && - !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && - is_full_stripe_write(sh); + is_full_stripe_write(sh); } /* we only do back search */ @@ -1241,10 +1240,6 @@ again: } if (rdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); - set_bit(STRIPE_IO_STARTED, &sh->state); bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags); @@ -1301,10 +1296,6 @@ again: submit_bio_noacct(bi); } if (rrdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); - set_bit(STRIPE_IO_STARTED, &sh->state); bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags); @@ -1345,8 +1336,6 @@ again: submit_bio_noacct(rbi); } if (!rdev && !rrdev) { - if (op_is_write(op)) - set_bit(STRIPE_DEGRADED, &sh->state); pr_debug("skip op %d on disc %d for sector %llu\n", bi->bi_opf, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -2884,7 +2873,6 @@ static void raid5_end_write_request(struct bio *bi) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { if (bi->bi_status) { - set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -3548,29 +3536,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, sh->dev[dd_idx].sector); - if (conf->mddev->bitmap && firstwrite) { - /* Cannot hold spinlock over bitmap_startwrite, - * but must ensure this isn't added to a batch until - * we have added to the bitmap and set bm_seq. - * So set STRIPE_BITMAP_PENDING to prevent - * batching. - * If multiple __add_stripe_bio() calls race here they - * much all set STRIPE_BITMAP_PENDING. So only the first one - * to complete "bitmap_startwrite" gets to set - * STRIPE_BIT_DELAY. This is important as once a stripe - * is added to a batch, STRIPE_BIT_DELAY cannot be changed - * any more. 
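With the write-intent bitmap updates moved out of the per-stripe write path, __add_stripe_bio() no longer has to drop stripe_lock around bitmap_ops->startwrite(), so the STRIPE_BITMAP_PENDING handshake described above goes away. What remains for the first write to a non-batched stripe is only the flush tagging:

        if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
                /* delay the stripe until raid5d has flushed the bitmap for
                 * this window (seq_write catching up with seq_flush) */
                sh->bm_seq = conf->seq_flush + 1;
                set_bit(STRIPE_BIT_DELAY, &sh->state);
        }
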
- */ - set_bit(STRIPE_BITMAP_PENDING, &sh->state); - spin_unlock_irq(&sh->stripe_lock); - conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, - RAID5_STRIPE_SECTORS(conf), false); - spin_lock_irq(&sh->stripe_lock); - clear_bit(STRIPE_BITMAP_PENDING, &sh->state); - if (!sh->batch_head) { - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); - } + if (conf->mddev->bitmap && firstwrite && !sh->batch_head) { + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); } } @@ -3621,7 +3589,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(sh->batch_head); for (i = disks; i--; ) { struct bio *bi; - int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { struct md_rdev *rdev = conf->disks[i].rdev; @@ -3646,8 +3613,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].towrite = NULL; sh->overwrite_disks = 0; spin_unlock_irq(&sh->stripe_lock); - if (bi) - bitmap_end = 1; log_stripe_write_finished(sh); @@ -3662,11 +3627,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bio_io_error(bi); bi = nextbi; } - if (bitmap_end) - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - false, false); - bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; @@ -3675,7 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].page = sh->dev[i].orig_page; } - if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); @@ -3709,10 +3668,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = nextbi; } } - if (bitmap_end) - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - false, false); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -3785,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector - || rdev->mddev->recovery_cp <= sh->sector)) + || rdev->mddev->resync_offset <= sh->sector)) rv = 1; return rv; } @@ -3877,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * is missing/faulty, then we need to read everything we can. */ if (!force_rcw && - sh->sector < sh->raid_conf->mddev->recovery_cp) + sh->sector < sh->raid_conf->mddev->resync_offset) /* reconstruct-write isn't being forced */ return 0; for (i = 0; i < s->failed && i < 2; i++) { @@ -4061,10 +4016,7 @@ returnbi: bio_endio(wbi); wbi = wbi2; } - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - false); + if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -4145,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - sector_t recovery_cp = conf->mddev->recovery_cp; + struct mddev *mddev = conf->mddev; + sector_t resync_offset = mddev->resync_offset; /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or @@ -4155,15 +4108,21 @@ static int handle_stripe_dirtying(struct r5conf *conf, * generate correct data from the parity. 
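After the recovery_cp to resync_offset rename, and with the new blocks_synced hook, the comment above translates into two cases that force reconstruct-write in handle_stripe_dirtying(); condensed from the hunk that follows:

        if (conf->rmw_level == PARITY_DISABLE_RMW ||
            (resync_offset < MaxSector && sh->sector >= resync_offset &&
             s->failed == 0)) {
                /* array is dirty past this point, parity cannot be trusted */
                rcw = 1; rmw = 2;
        } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced &&
                   !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) {
                /* lazy initial recovery has not reached this stripe yet:
                 * read everything and rebuild the parity */
                rcw = 1; rmw = 2;
        }
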
*/ if (conf->rmw_level == PARITY_DISABLE_RMW || - (recovery_cp < MaxSector && sh->sector >= recovery_cp && + (resync_offset < MaxSector && sh->sector >= resync_offset && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->rmw_level, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)resync_offset, (unsigned long long)sh->sector); + } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && + !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) { + /* The initial recover is not done, must read everything */ + rcw = 1; rmw = 2; + pr_debug("force RCW by lazy recovery, sh->sector=%llu\n", + sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; @@ -4196,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", + mddev_add_trace_msg(mddev, "raid5 rmw %llu %d", sh->sector, rmw); for (i = disks; i--; ) { @@ -4275,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_DELAYED, &sh->state); } } - if (rcw && !mddev_is_dm(conf->mddev)) - blk_add_trace_msg(conf->mddev->gendisk->queue, + if (rcw && !mddev_is_dm(mddev)) + blk_add_trace_msg(mddev->gendisk->queue, "raid5 rcw %llu %d %d %d", (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); @@ -4341,7 +4300,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, s->locked++; set_bit(R5_Wantwrite, &dev->flags); - clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); break; case check_state_run: @@ -4498,7 +4456,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, clear_bit(R5_Wantwrite, &dev->flags); s->locked--; } - clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); break; @@ -4748,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) - /* in sync if before recovery_offset */ - set_bit(R5_Insync, &dev->flags); - else if (test_bit(R5_UPTODATE, &dev->flags) && + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= + rdev->recovery_offset) { + /* + * in sync if: + * - normal IO, or + * - resync IO that is not lazy recovery + * + * For lazy recovery, we have to mark the rdev without + * In_sync as failed, to build initial xor data. + */ + if (!test_bit(STRIPE_SYNCING, &sh->state) || + !test_bit(MD_RECOVERY_LAZY_RECOVER, + &conf->mddev->recovery)) + set_bit(R5_Insync, &dev->flags); + } else if (test_bit(R5_UPTODATE, &dev->flags) && test_bit(R5_Expanded, &dev->flags)) /* If we've reshaped into here, we assume it is Insync. * We will shortly update recovery_offset to make @@ -4820,14 +4788,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, * we must be recovering. 
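The lazy-recovery rule also shows up in analyse_stripe() earlier in this hunk: a device that is only clean up to recovery_offset must not be counted as In_sync while a sync request is building the initial parity, otherwise the XOR data would never be generated. Condensed:

        if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) {
                /* normal IO, or a resync that is not lazy recovery: trust
                 * recovery_offset; for lazy recovery leave R5_Insync clear
                 * so the stripe gets rebuilt */
                if (!test_bit(STRIPE_SYNCING, &sh->state) ||
                    !test_bit(MD_RECOVERY_LAZY_RECOVER, &conf->mddev->recovery))
                        set_bit(R5_Insync, &dev->flags);
        }
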
- * else if we are after recovery_cp, we must be syncing + * else if we are after resync_offset, we must be syncing * else if MD_RECOVERY_REQUESTED is set, we also are syncing. * else we can only be replacing * sync and recovery both need to read all devices, and so * use the same flag. */ if (do_recovery || - sh->sector >= conf->mddev->recovery_cp || + sh->sector >= conf->mddev->resync_offset || test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) s->syncing = 1; else @@ -4891,8 +4859,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, (1 << STRIPE_COMPUTE_RUN) | (1 << STRIPE_DISCARD) | (1 << STRIPE_BATCH_READY) | - (1 << STRIPE_BATCH_ERR) | - (1 << STRIPE_BITMAP_PENDING)), + (1 << STRIPE_BATCH_ERR)), "stripe state: %lx\n", sh->state); WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | (1 << STRIPE_REPLACED)), @@ -4900,7 +4867,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED) | (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); @@ -4990,7 +4956,8 @@ static void handle_stripe(struct stripe_head *sh) goto finish; if (s.handle_bad_blocks || - test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { + (md_is_rdwr(conf->mddev) && + test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags))) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; } @@ -5520,17 +5487,17 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { - struct bio *split; sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; - split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, raid_bio); - submit_bio_noacct(raid_bio); - raid_bio = split; + + raid_bio = bio_submit_split_bioset(raid_bio, sectors, + &conf->bio_split); + if (!raid_bio) + return NULL; } if (!raid5_read_one_chunk(mddev, raid_bio)) @@ -5784,10 +5751,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap) { - for (d = 0; d < conf->raid_disks - conf->max_degraded; - d++) - mddev->bitmap_ops->startwrite(mddev, sh->sector, - RAID5_STRIPE_SECTORS(conf), false); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); } @@ -5906,6 +5869,9 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev, struct r5conf *conf, sector_t logical_sector) { sector_t reshape_progress, reshape_safe; + + if (likely(conf->reshape_progress == MaxSector)) + return LOC_NO_RESHAPE; /* * Spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be @@ -5928,6 +5894,54 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev, return LOC_BEHIND_RESHAPE; } +static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset, + unsigned long *sectors) +{ + struct r5conf *conf = mddev->private; + sector_t start = *offset; + sector_t end = start + *sectors; + sector_t prev_start = start; + sector_t prev_end = end; + int sectors_per_chunk; + enum reshape_loc loc; + int dd_idx; + + sectors_per_chunk = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + start = round_down(start, sectors_per_chunk); + end = round_up(end, sectors_per_chunk); + + start = 
raid5_compute_sector(conf, start, 0, &dd_idx, NULL); + end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL); + + /* + * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make + * progress, hence it's the same as LOC_BEHIND_RESHAPE. + */ + loc = get_reshape_loc(mddev, conf, prev_start); + if (likely(loc != LOC_AHEAD_OF_RESHAPE)) { + *offset = start; + *sectors = end - start; + return; + } + + sectors_per_chunk = conf->prev_chunk_sectors * + (conf->previous_raid_disks - conf->max_degraded); + prev_start = round_down(prev_start, sectors_per_chunk); + prev_end = round_down(prev_end, sectors_per_chunk); + + prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL); + prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL); + + /* + * for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO + * is handled in make_stripe_request(), we can't know this here hence + * we set bits for both. + */ + *offset = min(start, prev_start); + *sectors = max(end, prev_end) - *offset; +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5935,22 +5949,19 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, const int rw = bio_data_dir(bi); enum stripe_result ret; struct stripe_head *sh; + enum reshape_loc loc; sector_t new_sector; int previous = 0, flags = 0; int seq, dd_idx; seq = read_seqcount_begin(&conf->gen_lock); - - if (unlikely(conf->reshape_progress != MaxSector)) { - enum reshape_loc loc = get_reshape_loc(mddev, conf, - logical_sector); - if (loc == LOC_INSIDE_RESHAPE) { - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; - } - if (loc == LOC_AHEAD_OF_RESHAPE) - previous = 1; + loc = get_reshape_loc(mddev, conf, logical_sector); + if (loc == LOC_INSIDE_RESHAPE) { + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } + if (loc == LOC_AHEAD_OF_RESHAPE) + previous = 1; new_sector = raid5_compute_sector(conf, logical_sector, previous, &dd_idx, NULL); @@ -6127,7 +6138,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && - (conf->reshape_progress != MaxSector) && get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) { bio_wouldblock_error(bi); if (rw == WRITE) @@ -6501,11 +6511,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); return 0; } @@ -6534,8 +6545,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - true) && + !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); @@ -6544,7 +6554,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * RAID5_STRIPE_SECTORS(conf); } - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); + if 
(md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); @@ -6566,9 +6577,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n still_degraded = true; } - mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - still_degraded); - + md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -6760,7 +6769,8 @@ static void raid5d(struct md_thread *thread) int batch_size, released; unsigned int offset; - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + if (md_is_rdwr(mddev) && + test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) break; released = release_stripe_list(conf, conf->temp_inactive_list); @@ -6772,7 +6782,8 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - mddev->bitmap_ops->unplug(mddev, true); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -7736,11 +7747,13 @@ static int raid5_set_limits(struct mddev *mddev) stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); md_init_stacking_limits(&lim); + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, @@ -7789,7 +7802,7 @@ static int raid5_run(struct mddev *mddev) int first = 1; int ret = -EIO; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", mdname(mddev)); @@ -7930,7 +7943,7 @@ static int raid5_run(struct mddev *mddev) mdname(mddev)); mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); - } else if (mddev->recovery_cp == MaxSector) + } else if (mddev->resync_offset == MaxSector) set_bit(MD_JOURNAL_CLEAN, &mddev->flags); } @@ -7997,7 +8010,7 @@ static int raid5_run(struct mddev *mddev) mddev->resync_max_sectors = mddev->dev_sectors; if (mddev->degraded > dirty_parity_disks && - mddev->recovery_cp != MaxSector) { + mddev->resync_offset != MaxSector) { if (test_bit(MD_HAS_PPL, &mddev->flags)) pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", mdname(mddev)); @@ -8321,7 +8334,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; - int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8331,14 +8343,17 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, sectors, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > 
mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; @@ -8432,7 +8447,7 @@ static int raid5_start_reshape(struct mddev *mddev) return -EINVAL; /* raid5 can't handle concurrent reshape and recovery */ - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) return -EBUSY; for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].replacement) @@ -8657,7 +8672,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) mddev->raid_disks += 1; mddev->delta_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; return setup_conf(mddev); } @@ -8954,9 +8969,13 @@ static void raid5_prepare_suspend(struct mddev *mddev) static struct md_personality raid6_personality = { - .name = "raid6", - .level = 6, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID6, + .name = "raid6", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -8976,12 +8995,17 @@ static struct md_personality raid6_personality = .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, .prepare_suspend = raid5_prepare_suspend, + .bitmap_sector = raid5_bitmap_sector, }; static struct md_personality raid5_personality = { - .name = "raid5", - .level = 5, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID5, + .name = "raid5", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -9001,13 +9025,18 @@ static struct md_personality raid5_personality = .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, .prepare_suspend = raid5_prepare_suspend, + .bitmap_sector = raid5_bitmap_sector, }; static struct md_personality raid4_personality = { - .name = "raid4", - .level = 4, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID4, + .name = "raid4", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -9027,6 +9056,7 @@ static struct md_personality raid4_personality = .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, .prepare_suspend = raid5_prepare_suspend, + .bitmap_sector = raid5_bitmap_sector, }; static int __init raid5_init(void) @@ -9034,7 +9064,7 @@ static int __init raid5_init(void) int ret; raid5_wq = alloc_workqueue("raid5wq", - WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); + WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_SYSFS, 0); if (!raid5_wq) return -ENOMEM; @@ -9042,21 +9072,39 @@ static int __init raid5_init(void) "md/raid5:prepare", raid456_cpu_up_prepare, raid456_cpu_dead); - if (ret) { - destroy_workqueue(raid5_wq); - return ret; - } - register_md_personality(&raid6_personality); - register_md_personality(&raid5_personality); - register_md_personality(&raid4_personality); + if (ret) + goto err_destroy_wq; + + ret = register_md_submodule(&raid6_personality.head); + if (ret) + goto err_cpuhp_remove; + + ret = register_md_submodule(&raid5_personality.head); + if (ret) + goto err_unregister_raid6; + + ret = register_md_submodule(&raid4_personality.head); + if (ret) + goto err_unregister_raid5; + return 0; + +err_unregister_raid5: + unregister_md_submodule(&raid5_personality.head); +err_unregister_raid6: + unregister_md_submodule(&raid6_personality.head); +err_cpuhp_remove: + 
cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); +err_destroy_wq: + destroy_workqueue(raid5_wq); + return ret; } -static void raid5_exit(void) +static void __exit raid5_exit(void) { - unregister_md_personality(&raid6_personality); - unregister_md_personality(&raid5_personality); - unregister_md_personality(&raid4_personality); + unregister_md_submodule(&raid6_personality.head); + unregister_md_submodule(&raid5_personality.head); + unregister_md_submodule(&raid4_personality.head); cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); destroy_workqueue(raid5_wq); } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d174e586698f..eafc6e9ed6ee 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -358,7 +358,6 @@ enum { STRIPE_REPLACED, STRIPE_PREREAD_ACTIVE, STRIPE_DELAYED, - STRIPE_DEGRADED, STRIPE_BIT_DELAY, STRIPE_EXPANDING, STRIPE_EXPAND_SOURCE, @@ -372,9 +371,6 @@ enum { STRIPE_ON_RELEASE_LIST, STRIPE_BATCH_READY, STRIPE_BATCH_ERR, - STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add - * to batch yet. - */ STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) * this bit is used in two scenarios: * |
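Both raid10 and raid456 end with the register_md_personality() to register_md_submodule() switch; reassembled from the hunks above (only the unwind comment is added), the raid456 module init with its new error path reads:

static int __init raid5_init(void)
{
        int ret;

        raid5_wq = alloc_workqueue("raid5wq",
                WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
        if (!raid5_wq)
                return -ENOMEM;

        ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
                                      "md/raid5:prepare",
                                      raid456_cpu_up_prepare,
                                      raid456_cpu_dead);
        if (ret)
                goto err_destroy_wq;

        /* register each personality's md_submodule_head, undoing in
         * reverse order so a partial failure leaves nothing behind */
        ret = register_md_submodule(&raid6_personality.head);
        if (ret)
                goto err_cpuhp_remove;
        ret = register_md_submodule(&raid5_personality.head);
        if (ret)
                goto err_unregister_raid6;
        ret = register_md_submodule(&raid4_personality.head);
        if (ret)
                goto err_unregister_raid5;
        return 0;

err_unregister_raid5:
        unregister_md_submodule(&raid5_personality.head);
err_unregister_raid6:
        unregister_md_submodule(&raid6_personality.head);
err_cpuhp_remove:
        cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
err_destroy_wq:
        destroy_workqueue(raid5_wq);
        return ret;
}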
