Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 47
-rw-r--r--  drivers/md/Makefile | 7
-rw-r--r--  drivers/md/bcache/Kconfig | 1
-rw-r--r--  drivers/md/bcache/alloc.c | 82
-rw-r--r--  drivers/md/bcache/bcache.h | 8
-rw-r--r--  drivers/md/bcache/bset.c | 116
-rw-r--r--  drivers/md/bcache/bset.h | 44
-rw-r--r--  drivers/md/bcache/btree.c | 127
-rw-r--r--  drivers/md/bcache/debug.c | 3
-rw-r--r--  drivers/md/bcache/extents.c | 45
-rw-r--r--  drivers/md/bcache/io.c | 3
-rw-r--r--  drivers/md/bcache/journal.c | 93
-rw-r--r--  drivers/md/bcache/journal.h | 13
-rw-r--r--  drivers/md/bcache/movinggc.c | 43
-rw-r--r--  drivers/md/bcache/stats.c | 4
-rw-r--r--  drivers/md/bcache/super.c | 123
-rw-r--r--  drivers/md/bcache/sysfs.c | 19
-rw-r--r--  drivers/md/bcache/util.h | 67
-rw-r--r--  drivers/md/bcache/writeback.c | 28
-rw-r--r--  drivers/md/dm-bufio.c | 220
-rw-r--r--  drivers/md/dm-cache-policy-smq.c | 2
-rw-r--r--  drivers/md/dm-cache-target.c | 96
-rw-r--r--  drivers/md/dm-core.h | 6
-rw-r--r--  drivers/md/dm-crypt.c | 147
-rw-r--r--  drivers/md/dm-delay.c | 37
-rw-r--r--  drivers/md/dm-dust.c | 4
-rw-r--r--  drivers/md/dm-ebs-target.c | 12
-rw-r--r--  drivers/md/dm-flakey.c | 123
-rw-r--r--  drivers/md/dm-ima.c | 110
-rw-r--r--  drivers/md/dm-integrity.c | 477
-rw-r--r--  drivers/md/dm-io.c | 1
-rw-r--r--  drivers/md/dm-ioctl.c | 1
-rw-r--r--  drivers/md/dm-linear.c | 11
-rw-r--r--  drivers/md/dm-log-writes.c | 8
-rw-r--r--  drivers/md/dm-mpath.c | 247
-rw-r--r--  drivers/md/dm-path-selector.c | 8
-rw-r--r--  drivers/md/dm-path-selector.h | 2
-rw-r--r--  drivers/md/dm-pcache/Kconfig | 17
-rw-r--r--  drivers/md/dm-pcache/Makefile | 3
-rw-r--r--  drivers/md/dm-pcache/backing_dev.c | 374
-rw-r--r--  drivers/md/dm-pcache/backing_dev.h | 127
-rw-r--r--  drivers/md/dm-pcache/cache.c | 445
-rw-r--r--  drivers/md/dm-pcache/cache.h | 635
-rw-r--r--  drivers/md/dm-pcache/cache_dev.c | 303
-rw-r--r--  drivers/md/dm-pcache/cache_dev.h | 70
-rw-r--r--  drivers/md/dm-pcache/cache_gc.c | 170
-rw-r--r--  drivers/md/dm-pcache/cache_key.c | 888
-rw-r--r--  drivers/md/dm-pcache/cache_req.c | 836
-rw-r--r--  drivers/md/dm-pcache/cache_segment.c | 305
-rw-r--r--  drivers/md/dm-pcache/cache_writeback.c | 261
-rw-r--r--  drivers/md/dm-pcache/dm_pcache.c | 497
-rw-r--r--  drivers/md/dm-pcache/dm_pcache.h | 67
-rw-r--r--  drivers/md/dm-pcache/pcache_internal.h | 117
-rw-r--r--  drivers/md/dm-pcache/segment.c | 61
-rw-r--r--  drivers/md/dm-pcache/segment.h | 74
-rw-r--r--  drivers/md/dm-ps-historical-service-time.c | 9
-rw-r--r--  drivers/md/dm-ps-io-affinity.c | 7
-rw-r--r--  drivers/md/dm-ps-queue-length.c | 9
-rw-r--r--  drivers/md/dm-ps-round-robin.c | 9
-rw-r--r--  drivers/md/dm-ps-service-time.c | 9
-rw-r--r--  drivers/md/dm-raid.c | 95
-rw-r--r--  drivers/md/dm-raid1.c | 14
-rw-r--r--  drivers/md/dm-region-hash.c | 2
-rw-r--r--  drivers/md/dm-rq.c | 6
-rw-r--r--  drivers/md/dm-stripe.c | 23
-rw-r--r--  drivers/md/dm-switch.c | 8
-rw-r--r--  drivers/md/dm-table.c | 328
-rw-r--r--  drivers/md/dm-target.c | 5
-rw-r--r--  drivers/md/dm-thin.c | 16
-rw-r--r--  drivers/md/dm-vdo/block-map.c | 13
-rw-r--r--  drivers/md/dm-vdo/constants.h | 3
-rw-r--r--  drivers/md/dm-vdo/data-vio.c | 17
-rw-r--r--  drivers/md/dm-vdo/dedupe.c | 25
-rw-r--r--  drivers/md/dm-vdo/encodings.c | 20
-rw-r--r--  drivers/md/dm-vdo/funnel-workqueue.c | 3
-rw-r--r--  drivers/md/dm-vdo/indexer/index-layout.c | 5
-rw-r--r--  drivers/md/dm-vdo/indexer/index-session.c | 6
-rw-r--r--  drivers/md/dm-vdo/indexer/indexer.h | 53
-rw-r--r--  drivers/md/dm-vdo/indexer/volume-index.c | 4
-rw-r--r--  drivers/md/dm-vdo/indexer/volume.c | 24
-rw-r--r--  drivers/md/dm-vdo/io-submitter.c | 6
-rw-r--r--  drivers/md/dm-vdo/io-submitter.h | 18
-rw-r--r--  drivers/md/dm-vdo/logger.c | 2
-rw-r--r--  drivers/md/dm-vdo/packer.h | 2
-rw-r--r--  drivers/md/dm-vdo/priority-table.c | 2
-rw-r--r--  drivers/md/dm-vdo/recovery-journal.h | 6
-rw-r--r--  drivers/md/dm-vdo/slab-depot.c | 193
-rw-r--r--  drivers/md/dm-vdo/slab-depot.h | 13
-rw-r--r--  drivers/md/dm-vdo/types.h | 3
-rw-r--r--  drivers/md/dm-vdo/vdo.c | 11
-rw-r--r--  drivers/md/dm-vdo/vio.c | 56
-rw-r--r--  drivers/md/dm-vdo/vio.h | 13
-rw-r--r--  drivers/md/dm-vdo/wait-queue.c | 2
-rw-r--r--  drivers/md/dm-verity-fec.c | 75
-rw-r--r--  drivers/md/dm-verity-target.c | 262
-rw-r--r--  drivers/md/dm-verity-verify-sig.c | 17
-rw-r--r--  drivers/md/dm-verity.h | 22
-rw-r--r--  drivers/md/dm-writecache.c | 21
-rw-r--r--  drivers/md/dm-zone.c | 157
-rw-r--r--  drivers/md/dm-zoned-reclaim.c | 6
-rw-r--r--  drivers/md/dm-zoned-target.c | 5
-rw-r--r--  drivers/md/dm.c | 228
-rw-r--r--  drivers/md/dm.h | 9
-rw-r--r--  drivers/md/md-autodetect.c | 8
-rw-r--r--  drivers/md/md-bitmap.c | 249
-rw-r--r--  drivers/md/md-bitmap.h | 119
-rw-r--r--  drivers/md/md-cluster.c | 40
-rw-r--r--  drivers/md/md-cluster.h | 6
-rw-r--r--  drivers/md/md-linear.c | 350
-rw-r--r--  drivers/md/md-llbitmap.c | 1626
-rw-r--r--  drivers/md/md.c | 1420
-rw-r--r--  drivers/md/md.h | 141
-rw-r--r--  drivers/md/persistent-data/Kconfig | 2
-rw-r--r--  drivers/md/persistent-data/dm-array.c | 19
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c | 54
-rw-r--r--  drivers/md/raid0.c | 81
-rw-r--r--  drivers/md/raid1-10.c | 20
-rw-r--r--  drivers/md/raid1.c | 367
-rw-r--r--  drivers/md/raid1.h | 27
-rw-r--r--  drivers/md/raid10.c | 268
-rw-r--r--  drivers/md/raid10.h | 3
-rw-r--r--  drivers/md/raid5-cache.c | 53
-rw-r--r--  drivers/md/raid5-ppl.c | 22
-rw-r--r--  drivers/md/raid5.c | 322
-rw-r--r--  drivers/md/raid5.h | 4
125 files changed, 11663 insertions, 3015 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 1e9db8e4acdf..104aa5355090 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -37,6 +37,32 @@ config BLK_DEV_MD
If unsure, say N.
+config MD_BITMAP
+ bool "MD RAID bitmap support"
+ default y
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, support for the write intent bitmap will be
+ enabled. The bitmap can be used to optimize resync speed after a power
+ failure or after re-adding a disk, by limiting the resync to the dirty
+ sectors recorded in the bitmap.
+
+ The bitmap can be added to an existing MD array, or the array can be
+ created with a bitmap from the start, via mdadm(8).
+
+ If unsure, say Y.
+
+config MD_LLBITMAP
+ bool "MD RAID lockless bitmap support"
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, support for the lockless write intent bitmap will
+ be enabled.
+
+ Note, this is an experimental feature.
+
+ If unsure, say N.
+
config MD_AUTODETECT
bool "Autodetect RAID arrays during kernel boot"
depends on BLK_DEV_MD=y
@@ -54,6 +80,7 @@ config MD_AUTODETECT
config MD_BITMAP_FILE
bool "MD bitmap file support (deprecated)"
default y
+ depends on MD_BITMAP
help
If you say Y here, support for write intent bitmaps in files on an
external file system is enabled. This is an alternative to the internal
@@ -61,6 +88,19 @@ config MD_BITMAP_FILE
various kernel APIs and can only work with files on a file system not
actually sitting on the MD device.
+config MD_LINEAR
+ tristate "Linear (append) mode"
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, then your multiple devices driver will be able to
+ use the so-called linear mode, i.e. it will combine the hard disk
+ partitions by simply appending one to the other.
+
+ To compile this as a module, choose M here: the module
+ will be called linear.
+
+ If unsure, say Y.
+
config MD_RAID0
tristate "RAID-0 (striping) mode"
depends on BLK_DEV_MD
@@ -126,7 +166,7 @@ config MD_RAID456
tristate "RAID-4/RAID-5/RAID-6 mode"
depends on BLK_DEV_MD
select RAID6_PQ
- select LIBCRC32C
+ select CRC32
select ASYNC_MEMCPY
select ASYNC_XOR
select ASYNC_PQ
@@ -161,6 +201,7 @@ config MD_RAID456
config MD_CLUSTER
tristate "Cluster Support for MD"
+ select MD_BITMAP
depends on BLK_DEV_MD
depends on DLM
default n
@@ -254,6 +295,7 @@ config DM_CRYPT
depends on BLK_DEV_DM
depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
depends on (TRUSTED_KEYS || TRUSTED_KEYS=n)
+ select CRC32
select CRYPTO
select CRYPTO_CBC
select CRYPTO_ESSIV
@@ -379,6 +421,7 @@ config DM_RAID
select MD_RAID1
select MD_RAID10
select MD_RAID456
+ select MD_BITMAP
select BLK_DEV_MD
help
A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
@@ -645,4 +688,6 @@ config DM_AUDIT
source "drivers/md/dm-vdo/Kconfig"
+source "drivers/md/dm-pcache/Kconfig"
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 476a214e4bdc..c338cc6fbe2e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -27,14 +27,18 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y += dm-verity-target.o
dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
-md-mod-y += md.o md-bitmap.o
+md-mod-y += md.o
+md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o
+md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
+linear-y += md-linear.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
# themselves, and md.o may use the personalities when it
# auto-initialised.
+obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
@@ -69,6 +73,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o
obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_VDO) += dm-vdo/
+obj-$(CONFIG_DM_PCACHE) += dm-pcache/
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_EBS) += dm-ebs.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index d4697e79d5a3..b2d10063d35f 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -5,7 +5,6 @@ config BCACHE
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
select CLOSURES
- select MIN_HEAP
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 8998e61efa40..7708d92df23e 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -24,21 +24,18 @@
* Since the gens and priorities are all stored contiguously on disk, we can
* batch this up: We fill up the free_inc list with freshly invalidated buckets,
* call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
+ * free_inc list.
*
* free_inc isn't the only freelist - if it was, we'd often to sleep while
* priorities and gens were being written before we could allocate. c->free is a
* smaller freelist, and buckets on that list are always ready to be used.
*
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
* There is another freelist, because sometimes we have buckets that we know
* have nothing pointing into them - these we can reuse without waiting for
* priorities to be rewritten. These come from freed btree nodes and buckets
* that garbage collection discovered no longer had valid keys pointing into
* them (because they were overwritten). That's the unused list - buckets on the
- * unused list move to the free list, optionally being discarded in the process.
+ * unused list move to the free list.
*
* It's also important to ensure that gens don't wrap around - with respect to
* either the oldest gen in the btree or the gen on disk. This is quite
@@ -118,8 +115,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
/*
* Background allocation thread: scans for buckets to be invalidated,
* invalidates them, rewrites prios/gens (marking them as invalidated on disk),
- * then optionally issues discard commands to the newly free buckets, then puts
- * them on the various freelists.
+ * then puts them on the various freelists.
*/
static inline bool can_inc_bucket_gen(struct bucket *b)
@@ -164,61 +160,40 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
* prio is worth 1/8th of what INITIAL_PRIO is worth.
*/
-static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b)
-{
- unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;
-
- return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);
-}
-
-static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args)
-{
- struct bucket **lhs = (struct bucket **)l;
- struct bucket **rhs = (struct bucket **)r;
- struct cache *ca = args;
-
- return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs);
-}
-
-static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args)
-{
- struct bucket **lhs = (struct bucket **)l;
- struct bucket **rhs = (struct bucket **)r;
- struct cache *ca = args;
+#define bucket_prio(b) \
+({ \
+ unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
+ \
+ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
+})
- return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs);
-}
+#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
+#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
static void invalidate_buckets_lru(struct cache *ca)
{
struct bucket *b;
- const struct min_heap_callbacks bucket_max_cmp_callback = {
- .less = new_bucket_max_cmp,
- .swp = NULL,
- };
- const struct min_heap_callbacks bucket_min_cmp_callback = {
- .less = new_bucket_min_cmp,
- .swp = NULL,
- };
+ ssize_t i;
- ca->heap.nr = 0;
+ ca->heap.used = 0;
for_each_bucket(b, ca) {
if (!bch_can_invalidate_bucket(ca, b))
continue;
- if (!min_heap_full(&ca->heap))
- min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca);
- else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) {
+ if (!heap_full(&ca->heap))
+ heap_add(&ca->heap, b, bucket_max_cmp);
+ else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
ca->heap.data[0] = b;
- min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca);
+ heap_sift(&ca->heap, 0, bucket_max_cmp);
}
}
- min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca);
+ for (i = ca->heap.used / 2 - 1; i >= 0; --i)
+ heap_sift(&ca->heap, i, bucket_min_cmp);
while (!fifo_full(&ca->free_inc)) {
- if (!ca->heap.nr) {
+ if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
/*
* We don't want to be calling invalidate_buckets()
* multiple times when it can't do anything
@@ -227,8 +202,6 @@ static void invalidate_buckets_lru(struct cache *ca)
wake_up_gc(ca->set);
return;
}
- b = min_heap_peek(&ca->heap)[0];
- min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca);
bch_invalidate_one_bucket(ca, b);
}
@@ -344,8 +317,7 @@ static int bch_allocator_thread(void *arg)
while (1) {
/*
* First, we pull buckets off of the unused and free_inc lists,
- * possibly issue discards to them, then we add the bucket to
- * the free list:
+ * then we add the bucket to the free list:
*/
while (1) {
long bucket;
@@ -353,14 +325,6 @@ static int bch_allocator_thread(void *arg)
if (!fifo_pop(&ca->free_inc, bucket))
break;
- if (ca->discard) {
- mutex_unlock(&ca->set->bucket_lock);
- blkdev_issue_discard(ca->bdev,
- bucket_to_sector(ca->set, bucket),
- ca->sb.bucket_size, GFP_KERNEL);
- mutex_lock(&ca->set->bucket_lock);
- }
-
allocator_wait(ca, bch_allocator_push(ca, bucket));
wake_up(&ca->set->btree_cache_wait);
wake_up(&ca->set->bucket_wait);
@@ -435,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
TASK_UNINTERRUPTIBLE);
mutex_unlock(&ca->set->bucket_lock);
+
+ atomic_inc(&ca->set->bucket_wait_cnt);
schedule();
+ atomic_dec(&ca->set->bucket_wait_cnt);
+
mutex_lock(&ca->set->bucket_lock);
} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
!fifo_pop(&ca->free[reserve], r));
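For illustration of the heap helpers this file switches back to (heap_add(), heap_sift() and heap_pop() on ca->heap, with heap.used as the element count): the sketch below is a standalone approximation, not the actual macros from bcache/util.h, and the example_* names are invented for this note. Here cmp(parent, child) returning true means the pair is out of order and the child has to move up.

	#include <stdbool.h>
	#include <stddef.h>

	struct bucket;	/* opaque here; the real struct lives in bcache.h */

	struct example_bucket_heap {
		size_t size, used;
		struct bucket **data;
	};

	/* Sift-down from slot i of an array-backed binary heap. */
	static void example_heap_sift(struct example_bucket_heap *h, size_t i,
				      bool (*cmp)(struct bucket *, struct bucket *))
	{
		while (i * 2 + 1 < h->used) {
			size_t child = i * 2 + 1;

			/* pick whichever child would have to bubble up first */
			if (child + 1 < h->used &&
			    cmp(h->data[child], h->data[child + 1]))
				child++;

			if (!cmp(h->data[i], h->data[child]))
				break;	/* parent and child already in order */

			struct bucket *tmp = h->data[i];

			h->data[i] = h->data[child];
			h->data[child] = tmp;
			i = child;
		}
	}

Heapifying bottom-up, as the reintroduced for (i = ca->heap.used / 2 - 1; i >= 0; --i) loop in invalidate_buckets_lru() does, is just this sift applied from the last internal node down to the root, which builds the whole heap in O(n).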
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 785b0d9008fa..8ccacba85547 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -447,8 +447,7 @@ struct cache {
* free_inc: Incoming buckets - these are buckets that currently have
* cached data in them, and we can't reuse them until after we write
* their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
+ * gens/prios, they'll be moved to the free list.
*/
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
@@ -458,7 +457,7 @@ struct cache {
/* Allocation stuff: */
struct bucket *buckets;
- DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap;
+ DECLARE_HEAP(struct bucket *, heap);
/*
* If nonzero, we know we aren't going to find any buckets to invalidate
@@ -467,8 +466,6 @@ struct cache {
*/
unsigned int invalidate_needs_gc;
- bool discard; /* Get rid of? */
-
struct journal_device journal;
/* The rest of this all shows up in sysfs */
@@ -607,6 +604,7 @@ struct cache_set {
*/
atomic_t prio_blocked;
wait_queue_head_t bucket_wait;
+ atomic_t bucket_wait_cnt;
/*
* For any bio we don't skip we subtract the number of sectors from
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 68258a16e125..463eb13bd0b2 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -54,11 +54,9 @@ void bch_dump_bucket(struct btree_keys *b)
int __bch_count_data(struct btree_keys *b)
{
unsigned int ret = 0;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
if (b->ops->is_extents)
for_each_key(b, k, &iter)
ret += KEY_SIZE(k);
@@ -69,11 +67,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
{
va_list args;
struct bkey *k, *p = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
const char *err;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
for_each_key(b, k, &iter) {
if (b->ops->is_extents) {
err = "Keys out of order";
@@ -114,9 +110,9 @@ bug:
static void bch_btree_iter_next_check(struct btree_iter *iter)
{
- struct bkey *k = iter->heap.data->k, *next = bkey_next(k);
+ struct bkey *k = iter->data->k, *next = bkey_next(k);
- if (next < iter->heap.data->end &&
+ if (next < iter->data->end &&
bkey_cmp(k, iter->b->ops->is_extents ?
&START_KEY(next) : next) > 0) {
bch_dump_bucket(iter->b);
@@ -883,14 +879,12 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
unsigned int status = BTREE_INSERT_STATUS_NO_INSERT;
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey preceding_key_on_stack = ZERO_KEY;
struct bkey *preceding_key_p = &preceding_key_on_stack;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
/*
* If k has preceding key, preceding_key_p will be set to address
* of k's preceding key; otherwise preceding_key_p will be set
@@ -901,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
else
preceding_key(k, &preceding_key_p);
- m = bch_btree_iter_init(b, &iter, preceding_key_p);
+ m = bch_btree_iter_stack_init(b, &iter, preceding_key_p);
- if (b->ops->insert_fixup(b, k, &iter, replace_key))
+ if (b->ops->insert_fixup(b, k, &iter.iter, replace_key))
return status;
status = BTREE_INSERT_STATUS_INSERT;
@@ -1083,94 +1077,79 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
/* Btree iterator */
-typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *);
+typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
+ struct btree_iter_set);
-static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args)
+static inline bool btree_iter_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
{
- const struct btree_iter_set *_l = l;
- const struct btree_iter_set *_r = r;
-
- return bkey_cmp(_l->k, _r->k) <= 0;
+ return bkey_cmp(l.k, r.k) > 0;
}
static inline bool btree_iter_end(struct btree_iter *iter)
{
- return !iter->heap.nr;
+ return !iter->used;
}
void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
struct bkey *end)
{
- const struct min_heap_callbacks callbacks = {
- .less = new_btree_iter_cmp,
- .swp = NULL,
- };
-
if (k != end)
- BUG_ON(!min_heap_push(&iter->heap,
- &((struct btree_iter_set) { k, end }),
- &callbacks,
- NULL));
+ BUG_ON(!heap_add(iter,
+ ((struct btree_iter_set) { k, end }),
+ btree_iter_cmp));
}
-static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
- struct bkey *search,
- struct bset_tree *start)
+static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
+ struct bkey *search,
+ struct bset_tree *start)
{
struct bkey *ret = NULL;
- iter->heap.size = ARRAY_SIZE(iter->heap.preallocated);
- iter->heap.nr = 0;
+ iter->iter.size = ARRAY_SIZE(iter->stack_data);
+ iter->iter.used = 0;
#ifdef CONFIG_BCACHE_DEBUG
- iter->b = b;
+ iter->iter.b = b;
#endif
for (; start <= bset_tree_last(b); start++) {
ret = bch_bset_search(b, start, search);
- bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
+ bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data));
}
return ret;
}
-struct bkey *bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
+struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
struct bkey *search)
{
- return __bch_btree_iter_init(b, iter, search, b->set);
+ return __bch_btree_iter_stack_init(b, iter, search, b->set);
}
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
- new_btree_iter_cmp_fn *cmp)
+ btree_iter_cmp_fn *cmp)
{
struct btree_iter_set b __maybe_unused;
struct bkey *ret = NULL;
- const struct min_heap_callbacks callbacks = {
- .less = cmp,
- .swp = NULL,
- };
if (!btree_iter_end(iter)) {
bch_btree_iter_next_check(iter);
- ret = iter->heap.data->k;
- iter->heap.data->k = bkey_next(iter->heap.data->k);
+ ret = iter->data->k;
+ iter->data->k = bkey_next(iter->data->k);
- if (iter->heap.data->k > iter->heap.data->end) {
+ if (iter->data->k > iter->data->end) {
WARN_ONCE(1, "bset was corrupt!\n");
- iter->heap.data->k = iter->heap.data->end;
+ iter->data->k = iter->data->end;
}
- if (iter->heap.data->k == iter->heap.data->end) {
- if (iter->heap.nr) {
- b = min_heap_peek(&iter->heap)[0];
- min_heap_pop(&iter->heap, &callbacks, NULL);
- }
- }
+ if (iter->data->k == iter->data->end)
+ heap_pop(iter, b, cmp);
else
- min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
+ heap_sift(iter, 0, cmp);
}
return ret;
@@ -1178,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
struct bkey *bch_btree_iter_next(struct btree_iter *iter)
{
- return __bch_btree_iter_next(iter, new_btree_iter_cmp);
+ return __bch_btree_iter_next(iter, btree_iter_cmp);
}
@@ -1216,18 +1195,16 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out,
struct btree_iter *iter,
bool fixup, bool remove_stale)
{
+ int i;
struct bkey *k, *last = NULL;
BKEY_PADDED(k) tmp;
bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
? bch_ptr_bad
: bch_ptr_invalid;
- const struct min_heap_callbacks callbacks = {
- .less = b->ops->sort_cmp,
- .swp = NULL,
- };
/* Heapify the iterator, using our comparison function */
- min_heapify_all(&iter->heap, &callbacks, NULL);
+ for (i = iter->used / 2 - 1; i >= 0; --i)
+ heap_sift(iter, i, b->ops->sort_cmp);
while (!btree_iter_end(iter)) {
if (b->ops->sort_fixup && fixup)
@@ -1316,11 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
struct bset_sort_state *state)
{
size_t order = b->page_order, keys = 0;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
int oldsize = bch_count_data(b);
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
+ __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]);
if (start) {
unsigned int i;
@@ -1331,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
order = get_order(__set_bytes(b->set->data, keys));
}
- __btree_sort(b, &iter, start, order, false, state);
+ __btree_sort(b, &iter.iter, start, order, false, state);
EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
@@ -1347,13 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
struct bset_sort_state *state)
{
uint64_t start_time = local_clock();
- struct btree_iter iter;
-
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+ struct btree_iter_stack iter;
- bch_btree_iter_init(b, &iter, NULL);
+ bch_btree_iter_stack_init(b, &iter, NULL);
- btree_mergesort(b, new->set->data, &iter, false, true);
+ btree_mergesort(b, new->set->data, &iter.iter, false, true);
bch_time_stats_update(&state->time, start_time);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index f79441acd4c1..6ee2c6a506a2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -187,9 +187,8 @@ struct bset_tree {
};
struct btree_keys_ops {
- bool (*sort_cmp)(const void *l,
- const void *r,
- void *args);
+ bool (*sort_cmp)(struct btree_iter_set l,
+ struct btree_iter_set r);
struct bkey *(*sort_fixup)(struct btree_iter *iter,
struct bkey *tmp);
bool (*insert_fixup)(struct btree_keys *b,
@@ -313,18 +312,28 @@ enum {
BTREE_INSERT_STATUS_FRONT_MERGE,
};
-struct btree_iter_set {
- struct bkey *k, *end;
-};
-
/* Btree key iteration */
struct btree_iter {
+ size_t size, used;
#ifdef CONFIG_BCACHE_DEBUG
struct btree_keys *b;
#endif
- MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap;
+ struct btree_iter_set {
+ struct bkey *k, *end;
+ } data[];
+};
+
+/* Fixed-size btree_iter that can be allocated on the stack */
+
+struct btree_iter_stack {
+ /* Must be last as it ends in a flexible-array member. */
+ TRAILING_OVERLAP(struct btree_iter, iter, data,
+ struct btree_iter_set stack_data[MAX_BSETS];
+ );
};
+static_assert(offsetof(struct btree_iter_stack, iter.data) ==
+ offsetof(struct btree_iter_stack, stack_data));
typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k);
@@ -335,9 +344,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
struct bkey *end);
-struct bkey *bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
- struct bkey *search);
+struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
+ struct bkey *search);
struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
const struct bkey *search);
@@ -352,13 +361,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
return search ? __bch_bset_search(b, t, search) : t->data->start;
}
-#define for_each_key_filter(b, k, iter, filter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+#define for_each_key_filter(b, k, stack_iter, filter) \
+ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
+ ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \
+ filter));)
-#define for_each_key(b, k, iter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next(iter));)
+#define for_each_key(b, k, stack_iter) \
+ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
+ ((k) = bch_btree_iter_next(&((stack_iter)->iter)));)
/* Sorting */
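The new struct btree_iter_stack relies on TRAILING_OVERLAP() plus the static_assert() to guarantee that the fixed-size stack_data[] array occupies exactly the storage where the flexible iter.data[] member begins. Below is a minimal sketch of the same layout trick with hypothetical names, using a plain anonymous union instead of the kernel helper macro (GCC/Clang accept a flexible-array struct as a union member, which is what the macro relies on as well):

	#include <assert.h>
	#include <stddef.h>

	struct sample_set { int *k, *end; };

	/* Flexible-array form: callers only ever see a pointer to this. */
	struct sample_iter {
		size_t size, used;
		struct sample_set data[];
	};

	#define SAMPLE_MAX 4

	/* Fixed-size variant that can live on the stack: the union overlays
	 * real storage for SAMPLE_MAX elements onto the flexible array.
	 */
	struct sample_iter_stack {
		union {
			struct sample_iter iter;
			struct {
				size_t pad_size, pad_used;
				struct sample_set stack_data[SAMPLE_MAX];
			};
		};
	};

	static_assert(offsetof(struct sample_iter_stack, stack_data) ==
		      offsetof(struct sample_iter, data),
		      "stack_data must start where iter.data[] begins");

Code that takes a struct btree_iter * can then be handed &stack_iter->iter and index data[] up to MAX_BSETS entries without any separate allocation, which is exactly how the for_each_key() macros above use it.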
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ed40d8600656..3ed39c823826 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -36,6 +36,7 @@
#include <linux/sched/clock.h>
#include <linux/rculist.h>
#include <linux/delay.h>
+#include <linux/sort.h>
#include <trace/events/bcache.h>
/*
@@ -88,10 +89,9 @@
* Test module load/unload
*/
-#define MAX_NEED_GC 64
-#define MAX_SAVE_PRIO 72
-#define MAX_GC_TIMES 100
-#define MIN_GC_NODES 100
+#define MAX_GC_TIMES_SHIFT 7 /* 128 loops */
+#define GC_NODES_MIN 10
+#define GC_SLEEP_MS_MIN 10
#define GC_SLEEP_MS 100
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
@@ -149,19 +149,19 @@ void bch_btree_node_read_done(struct btree *b)
{
const char *err = "bad btree header";
struct bset *i = btree_bset_first(b);
- struct btree_iter iter;
+ struct btree_iter *iter;
/*
* c->fill_iter can allocate an iterator with more memory space
* than static MAX_BSETS.
* See the comment arount cache_set->fill_iter.
*/
- iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
- iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
- iter.heap.nr = 0;
+ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
+ iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
+ iter->used = 0;
#ifdef CONFIG_BCACHE_DEBUG
- iter.b = &b->keys;
+ iter->b = &b->keys;
#endif
if (!i->seq)
@@ -199,7 +199,7 @@ void bch_btree_node_read_done(struct btree *b)
if (i != b->keys.set[0].data && !i->keys)
goto err;
- bch_btree_iter_push(&iter, i->start, bset_bkey_last(i));
+ bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
b->written += set_blocks(i, block_bytes(b->c->cache));
}
@@ -211,7 +211,7 @@ void bch_btree_node_read_done(struct btree *b)
if (i->seq == b->keys.set[0].data->seq)
goto err;
- bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort);
+ bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
i = b->keys.set[0].data;
err = "short btree key";
@@ -223,7 +223,7 @@ void bch_btree_node_read_done(struct btree *b)
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->cache->sb));
out:
- mempool_free(iter.heap.data, &b->c->fill_iter);
+ mempool_free(iter, &b->c->fill_iter);
return;
err:
set_btree_node_io_error(b);
@@ -372,7 +372,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
- if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+ if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) {
struct bio_vec *bv;
void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
struct bvec_iter_all iter_all;
@@ -559,8 +559,6 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
}
}
-#define cmp_int(l, r) ((l > r) - (l < r))
-
#ifdef CONFIG_PROVE_LOCKING
static int btree_lock_cmp_fn(const struct lockdep_map *_a,
const struct lockdep_map *_b)
@@ -1309,11 +1307,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
uint8_t stale = 0;
unsigned int keys = 0, good_keys = 0;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bset_tree *t;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
gc->nodes++;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
@@ -1572,11 +1568,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
static unsigned int btree_gc_count_keys(struct btree *b)
{
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
unsigned int ret = 0;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
ret += bkey_u64s(k);
@@ -1585,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b)
static size_t btree_gc_min_nodes(struct cache_set *c)
{
- size_t min_nodes;
+ size_t min_nodes = GC_NODES_MIN;
- /*
- * Since incremental GC would stop 100ms when front
- * side I/O comes, so when there are many btree nodes,
- * if GC only processes constant (100) nodes each time,
- * GC would last a long time, and the front side I/Os
- * would run out of the buckets (since no new bucket
- * can be allocated during GC), and be blocked again.
- * So GC should not process constant nodes, but varied
- * nodes according to the number of btree nodes, which
- * realized by dividing GC into constant(100) times,
- * so when there are many btree nodes, GC can process
- * more nodes each time, otherwise, GC will process less
- * nodes each time (but no less than MIN_GC_NODES)
- */
- min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
- if (min_nodes < MIN_GC_NODES)
- min_nodes = MIN_GC_NODES;
+ if (atomic_read(&c->search_inflight) == 0) {
+ size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT;
+
+ if (min_nodes < n)
+ min_nodes = n;
+ }
return min_nodes;
}
+static uint64_t btree_gc_sleep_ms(struct cache_set *c)
+{
+ uint64_t sleep_ms;
+
+ if (atomic_read(&c->bucket_wait_cnt) > 0)
+ sleep_ms = GC_SLEEP_MS_MIN;
+ else
+ sleep_ms = GC_SLEEP_MS;
+
+ return sleep_ms;
+}
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
@@ -1615,18 +1609,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
int ret = 0;
bool should_rewrite;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
+ bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done);
for (i = r; i < r + ARRAY_SIZE(r); i++)
i->b = ERR_PTR(-EINTR);
while (1) {
- k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
+ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad);
if (k) {
r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
true, b);
@@ -1675,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;
- if (atomic_read(&b->c->search_inflight) &&
- gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+ if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) {
gc->nodes_pre = gc->nodes;
ret = -EAGAIN;
break;
@@ -1853,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c)
cond_resched();
if (ret == -EAGAIN)
- schedule_timeout_interruptible(msecs_to_jiffies
- (GC_SLEEP_MS));
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(btree_gc_sleep_ms(c)));
else if (ret)
pr_warn("gc failed!\n");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -1921,9 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{
int ret = 0;
struct bkey *k, *p = NULL;
- struct btree_iter iter;
-
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+ struct btree_iter_stack iter;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(b->c, b->level, k);
@@ -1931,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
bch_initial_mark_key(b->c, b->level + 1, &b->key);
if (b->level) {
- bch_btree_iter_init(&b->keys, &iter, NULL);
+ bch_btree_iter_stack_init(&b->keys, &iter, NULL);
do {
- k = bch_btree_iter_next_filter(&iter, &b->keys,
+ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
bch_ptr_bad);
if (k) {
btree_node_prefetch(b, k);
@@ -1962,7 +1953,7 @@ static int bch_btree_check_thread(void *arg)
struct btree_check_info *info = arg;
struct btree_check_state *check_state = info->state;
struct cache_set *c = check_state->c;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
@@ -1970,11 +1961,9 @@ static int bch_btree_check_thread(void *arg)
cur_idx = prev_idx = 0;
ret = 0;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
/* root node keys are checked before thread created */
- bch_btree_iter_init(&c->root->keys, &iter, NULL);
- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k;
@@ -1992,7 +1981,7 @@ static int bch_btree_check_thread(void *arg)
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
- k = bch_btree_iter_next_filter(&iter,
+ k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad);
if (k)
@@ -2065,11 +2054,9 @@ int bch_btree_check(struct cache_set *c)
int ret = 0;
int i;
struct bkey *k = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct btree_check_state check_state;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
/* check and mark root node keys */
for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(c, c->root->level, k);
@@ -2563,12 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
if (b->level) {
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&b->keys, &iter, from);
+ bch_btree_iter_stack_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
+ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
bch_ptr_bad))) {
ret = bcache_btree(map_nodes_recurse, k, b,
op, from, fn, flags);
@@ -2597,12 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
{
int ret = MAP_CONTINUE;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&b->keys, &iter, from);
+ bch_btree_iter_stack_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
+ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad))) {
ret = !b->level
? fn(op, b, k)
: bcache_btree(map_keys_recurse, k,
@@ -2836,7 +2822,8 @@ void bch_btree_exit(void)
int __init bch_btree_init(void)
{
- btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+ btree_io_wq = alloc_workqueue("bch_btree_io",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!btree_io_wq)
return -ENOMEM;
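The GC pacing changes above drop the fixed MIN_GC_NODES/GC_SLEEP_MS scheme: each GC round now scales its batch with the total node count only while no front-end searches are in flight, and the sleep between rounds shrinks whenever writers are blocked waiting for buckets. Below is a rough standalone sketch of that policy; the EX_*/example_* names and the numbers in comments are illustrative only.

	#define EX_GC_TIMES_SHIFT	7	/* aim for roughly 128 rounds */
	#define EX_GC_NODES_MIN		10	/* never batch fewer nodes    */
	#define EX_GC_SLEEP_MS		100	/* normal pause per round     */
	#define EX_GC_SLEEP_MS_MIN	10	/* pause when writers wait    */

	static unsigned long example_gc_batch(unsigned long total_nodes,
					      int searches_inflight)
	{
		unsigned long batch = EX_GC_NODES_MIN;

		/* e.g. 100000 nodes and an idle front end: 100000 >> 7 = 781 */
		if (searches_inflight == 0 &&
		    (total_nodes >> EX_GC_TIMES_SHIFT) > batch)
			batch = total_nodes >> EX_GC_TIMES_SHIFT;

		return batch;
	}

	static unsigned int example_gc_sleep_ms(int bucket_waiters)
	{
		/* back off less when allocations are stalled on free buckets */
		return bucket_waiters > 0 ? EX_GC_SLEEP_MS_MIN : EX_GC_SLEEP_MS;
	}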
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 7510d1c983a5..f327456fc4e0 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
check = bio_kmalloc(nr_segs, GFP_NOIO);
if (!check)
return;
- bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs,
- REQ_OP_READ);
+ bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ);
check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 4b84fda1530a..d626ffcbecb9 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -33,16 +33,15 @@ static void sort_key_next(struct btree_iter *iter,
i->k = bkey_next(i->k);
if (i->k == i->end)
- *i = iter->heap.data[--iter->heap.nr];
+ *i = iter->data[--iter->used];
}
-static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args)
+static bool bch_key_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
{
- struct btree_iter_set *_l = (struct btree_iter_set *)l;
- struct btree_iter_set *_r = (struct btree_iter_set *)r;
- int64_t c = bkey_cmp(_l->k, _r->k);
+ int64_t c = bkey_cmp(l.k, r.k);
- return !(c ? c > 0 : _l->k < _r->k);
+ return c ? c > 0 : l.k < r.k;
}
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
@@ -239,7 +238,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
}
const struct btree_keys_ops bch_btree_keys_ops = {
- .sort_cmp = new_bch_key_sort_cmp,
+ .sort_cmp = bch_key_sort_cmp,
.insert_fixup = bch_btree_ptr_insert_fixup,
.key_invalid = bch_btree_ptr_invalid,
.key_bad = bch_btree_ptr_bad,
@@ -256,28 +255,22 @@ const struct btree_keys_ops bch_btree_keys_ops = {
* Necessary for btree_sort_fixup() - if there are multiple keys that compare
* equal in different sets, we have to process them newest to oldest.
*/
-
-static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args)
+static bool bch_extent_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
{
- struct btree_iter_set *_l = (struct btree_iter_set *)l;
- struct btree_iter_set *_r = (struct btree_iter_set *)r;
- int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k));
+ int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
- return !(c ? c > 0 : _l->k < _r->k);
+ return c ? c > 0 : l.k < r.k;
}
static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
struct bkey *tmp)
{
- const struct min_heap_callbacks callbacks = {
- .less = new_bch_extent_sort_cmp,
- .swp = NULL,
- };
- while (iter->heap.nr > 1) {
- struct btree_iter_set *top = iter->heap.data, *i = top + 1;
-
- if (iter->heap.nr > 2 &&
- !new_bch_extent_sort_cmp(&i[0], &i[1], NULL))
+ while (iter->used > 1) {
+ struct btree_iter_set *top = iter->data, *i = top + 1;
+
+ if (iter->used > 2 &&
+ bch_extent_sort_cmp(i[0], i[1]))
i++;
if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
@@ -285,7 +278,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
if (!KEY_SIZE(i->k)) {
sort_key_next(iter, i);
- min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
continue;
}
@@ -295,7 +288,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
else
bch_cut_front(top->k, i->k);
- min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
} else {
/* can't happen because of comparison func */
BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
@@ -305,7 +298,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
bch_cut_back(&START_KEY(i->k), tmp);
bch_cut_front(i->k, top->k);
- min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
+ heap_sift(iter, 0, bch_extent_sort_cmp);
return tmp;
} else {
@@ -625,7 +618,7 @@ static bool bch_extent_merge(struct btree_keys *bk,
}
const struct btree_keys_ops bch_extent_keys_ops = {
- .sort_cmp = new_bch_extent_sort_cmp,
+ .sort_cmp = bch_extent_sort_cmp,
.sort_fixup = bch_extent_sort_fixup,
.insert_fixup = bch_extent_insert_fixup,
.key_invalid = bch_extent_invalid,
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 020712c5203f..2386d08bf4e4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
- meta_bucket_pages(&c->cache->sb), 0);
+ bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0);
return bio;
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7ff14bd2feb8..144693b7c46a 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -275,8 +275,7 @@ bsearch:
* ja->cur_idx
*/
ja->cur_idx = i;
- ja->last_idx = ja->discard_idx = (i + 1) %
- ca->sb.njournal_buckets;
+ ja->last_idx = (i + 1) % ca->sb.njournal_buckets;
}
@@ -336,16 +335,6 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
}
}
-static bool is_discard_enabled(struct cache_set *s)
-{
- struct cache *ca = s->cache;
-
- if (ca->discard)
- return true;
-
- return false;
-}
-
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
int ret = 0, keys = 0, entries = 0;
@@ -360,15 +349,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
BUG_ON(i->pin && atomic_read(i->pin) != 1);
if (n != i->j.seq) {
- if (n == start && is_discard_enabled(s))
- pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n",
- n, i->j.seq - 1, start, end);
- else {
- pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
- n, i->j.seq - 1, start, end);
- ret = -EIO;
- goto err;
- }
+ pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
+ n, i->j.seq - 1, start, end);
+ ret = -EIO;
+ goto err;
}
for (k = i->j.start;
@@ -568,65 +552,6 @@ out:
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
-static void journal_discard_endio(struct bio *bio)
-{
- struct journal_device *ja =
- container_of(bio, struct journal_device, discard_bio);
- struct cache *ca = container_of(ja, struct cache, journal);
-
- atomic_set(&ja->discard_in_flight, DISCARD_DONE);
-
- closure_wake_up(&ca->set->journal.wait);
- closure_put(&ca->set->cl);
-}
-
-static void journal_discard_work(struct work_struct *work)
-{
- struct journal_device *ja =
- container_of(work, struct journal_device, discard_work);
-
- submit_bio(&ja->discard_bio);
-}
-
-static void do_journal_discard(struct cache *ca)
-{
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->discard_bio;
-
- if (!ca->discard) {
- ja->discard_idx = ja->last_idx;
- return;
- }
-
- switch (atomic_read(&ja->discard_in_flight)) {
- case DISCARD_IN_FLIGHT:
- return;
-
- case DISCARD_DONE:
- ja->discard_idx = (ja->discard_idx + 1) %
- ca->sb.njournal_buckets;
-
- atomic_set(&ja->discard_in_flight, DISCARD_READY);
- fallthrough;
-
- case DISCARD_READY:
- if (ja->discard_idx == ja->last_idx)
- return;
-
- atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
-
- bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD);
- bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
- ca->sb.d[ja->discard_idx]);
- bio->bi_iter.bi_size = bucket_bytes(ca);
- bio->bi_end_io = journal_discard_endio;
-
- closure_get(&ca->set->cl);
- INIT_WORK(&ja->discard_work, journal_discard_work);
- queue_work(bch_journal_wq, &ja->discard_work);
- }
-}
-
static unsigned int free_journal_buckets(struct cache_set *c)
{
struct journal *j = &c->journal;
@@ -635,10 +560,10 @@ static unsigned int free_journal_buckets(struct cache_set *c)
unsigned int n;
/* In case njournal_buckets is not power of 2 */
- if (ja->cur_idx >= ja->discard_idx)
- n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx;
+ if (ja->cur_idx >= ja->last_idx)
+ n = ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx;
else
- n = ja->discard_idx - ja->cur_idx;
+ n = ja->last_idx - ja->cur_idx;
if (n > (1 + j->do_reserve))
return n - (1 + j->do_reserve);
@@ -668,8 +593,6 @@ static void journal_reclaim(struct cache_set *c)
ja->last_idx = (ja->last_idx + 1) %
ca->sb.njournal_buckets;
- do_journal_discard(ca);
-
if (c->journal.blocks_free)
goto out;
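With the discard machinery gone, free_journal_buckets() above measures free space in the journal ring purely from last_idx and cur_idx, with an explicit branch because njournal_buckets need not be a power of two. Below is a small standalone sketch of that ring arithmetic; example_free_journal_buckets() is a hypothetical helper written for this note, not part of the patch.

	/*
	 * How many journal buckets remain between cur_idx (next bucket to
	 * write) and last_idx (oldest bucket still holding an open entry),
	 * keeping 1 + do_reserve buckets back, as in free_journal_buckets().
	 */
	static unsigned int example_free_journal_buckets(unsigned int njournal_buckets,
							 unsigned int cur_idx,
							 unsigned int last_idx,
							 unsigned int do_reserve)
	{
		unsigned int n;

		/* wrap-around without assuming a power-of-2 ring size */
		if (cur_idx >= last_idx)
			n = njournal_buckets + last_idx - cur_idx;
		else
			n = last_idx - cur_idx;

		/* e.g. 8 buckets, cur_idx = 6, last_idx = 2: n = 8 + 2 - 6 = 4 */
		if (n > 1 + do_reserve)
			return n - (1 + do_reserve);

		return 0;
	}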
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index cd316b4a1e95..9e9d1b3016a5 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -139,19 +139,6 @@ struct journal_device {
/* Last journal bucket that still contains an open journal entry */
unsigned int last_idx;
- /* Next journal bucket to be discarded */
- unsigned int discard_idx;
-
-#define DISCARD_READY 0
-#define DISCARD_IN_FLIGHT 1
-#define DISCARD_DONE 2
- /* 1 - discard in flight, -1 - discard completed */
- atomic_t discard_in_flight;
-
- struct work_struct discard_work;
- struct bio discard_bio;
- struct bio_vec discard_bv;
-
/* Bio for journal reads/writes to this device */
struct bio bio;
struct bio_vec bv[8];
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index ef6abf33f926..73918e55bf04 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -79,10 +79,10 @@ static void moving_init(struct moving_io *io)
{
struct bio *bio = &io->bio.bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
bio_get(bio);
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
bio->bi_private = &io->cl;
@@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c)
continue;
}
- io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
@@ -182,19 +182,16 @@ err: if (!IS_ERR_OR_NULL(w->private))
closure_sync(&cl);
}
-static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args)
+static bool bucket_cmp(struct bucket *l, struct bucket *r)
{
- struct bucket **_l = (struct bucket **)l;
- struct bucket **_r = (struct bucket **)r;
-
- return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r);
+ return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
}
static unsigned int bucket_heap_top(struct cache *ca)
{
struct bucket *b;
- return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0;
+ return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
}
void bch_moving_gc(struct cache_set *c)
@@ -202,10 +199,6 @@ void bch_moving_gc(struct cache_set *c)
struct cache *ca = c->cache;
struct bucket *b;
unsigned long sectors_to_move, reserve_sectors;
- const struct min_heap_callbacks callbacks = {
- .less = new_bucket_cmp,
- .swp = NULL,
- };
if (!c->copy_gc_enabled)
return;
@@ -216,7 +209,7 @@ void bch_moving_gc(struct cache_set *c)
reserve_sectors = ca->sb.bucket_size *
fifo_used(&ca->free[RESERVE_MOVINGGC]);
- ca->heap.nr = 0;
+ ca->heap.used = 0;
for_each_bucket(b, ca) {
if (GC_MARK(b) == GC_MARK_METADATA ||
@@ -225,31 +218,25 @@ void bch_moving_gc(struct cache_set *c)
atomic_read(&b->pin))
continue;
- if (!min_heap_full(&ca->heap)) {
+ if (!heap_full(&ca->heap)) {
sectors_to_move += GC_SECTORS_USED(b);
- min_heap_push(&ca->heap, &b, &callbacks, NULL);
- } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) {
+ heap_add(&ca->heap, b, bucket_cmp);
+ } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
sectors_to_move -= bucket_heap_top(ca);
sectors_to_move += GC_SECTORS_USED(b);
ca->heap.data[0] = b;
- min_heap_sift_down(&ca->heap, 0, &callbacks, NULL);
+ heap_sift(&ca->heap, 0, bucket_cmp);
}
}
while (sectors_to_move > reserve_sectors) {
- if (ca->heap.nr) {
- b = min_heap_peek(&ca->heap)[0];
- min_heap_pop(&ca->heap, &callbacks, NULL);
- }
+ heap_pop(&ca->heap, b, bucket_cmp);
sectors_to_move -= GC_SECTORS_USED(b);
}
- while (ca->heap.nr) {
- b = min_heap_peek(&ca->heap)[0];
- min_heap_pop(&ca->heap, &callbacks, NULL);
+ while (heap_pop(&ca->heap, b, bucket_cmp))
SET_GC_MOVE(b, 1);
- }
mutex_unlock(&c->bucket_lock);
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 68b02216033d..0056106495a7 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -123,7 +123,7 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc)
kobject_put(&acc->day.kobj);
atomic_set(&acc->closing, 1);
- if (del_timer_sync(&acc->timer))
+ if (timer_delete_sync(&acc->timer))
closure_return(&acc->cl);
}
@@ -149,7 +149,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
static void scale_accounting(struct timer_list *t)
{
- struct cache_accounting *acc = from_timer(acc, t, timer);
+ struct cache_accounting *acc = timer_container_of(acc, t, timer);
#define move_stat(name) do { \
unsigned int t = atomic_xchg(&acc->collector.name, 0); \
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e7abfdd77c3b..c17d4517af22 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -168,14 +168,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
{
const char *err;
struct cache_sb_disk *s;
- struct page *page;
+ struct folio *folio;
unsigned int i;
- page = read_cache_page_gfp(bdev->bd_mapping,
- SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
- if (IS_ERR(page))
+ folio = mapping_read_folio_gfp(bdev->bd_mapping,
+ SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
+ if (IS_ERR(folio))
return "IO error";
- s = page_address(page) + offset_in_page(SB_OFFSET);
+ s = folio_address(folio) + offset_in_folio(folio, SB_OFFSET);
sb->offset = le64_to_cpu(s->offset);
sb->version = le64_to_cpu(s->version);
@@ -272,7 +272,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
*res = s;
return NULL;
err:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- __bio_add_page(bio, virt_to_page(out), SB_SIZE,
- offset_in_page(out));
+ bio_add_virt_nofail(bio, out, SB_SIZE);
out->offset = cpu_to_le64(sb->offset);
@@ -546,7 +545,8 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
- static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+ static const char zero_uuid[16] __nonstring =
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
return uuid_find(c, zero_uuid);
}
@@ -1366,7 +1366,7 @@ static CLOSURE_CALLBACK(cached_dev_free)
mutex_unlock(&bch_register_lock);
if (dc->sb_disk)
- put_page(virt_to_page(dc->sb_disk));
+ folio_put(virt_to_folio(dc->sb_disk));
if (dc->bdev_file)
fput(dc->bdev_file);
@@ -1388,7 +1388,7 @@ static CLOSURE_CALLBACK(cached_dev_flush)
bch_cache_accounting_destroy(&dc->accounting);
kobject_del(&d->kobj);
- continue_at(cl, cached_dev_free, system_wq);
+ continue_at(cl, cached_dev_free, system_percpu_wq);
}
static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
@@ -1400,7 +1400,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
__module_get(THIS_MODULE);
INIT_LIST_HEAD(&dc->list);
closure_init(&dc->disk.cl, NULL);
- set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+ set_closure_fn(&dc->disk.cl, cached_dev_flush, system_percpu_wq);
kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
INIT_WORK(&dc->detach, cached_dev_detach_finish);
sema_init(&dc->sb_write_mutex, 1);
@@ -1513,7 +1513,7 @@ static CLOSURE_CALLBACK(flash_dev_flush)
bcache_device_unlink(d);
mutex_unlock(&bch_register_lock);
kobject_del(&d->kobj);
- continue_at(cl, flash_dev_free, system_wq);
+ continue_at(cl, flash_dev_free, system_percpu_wq);
}
static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
@@ -1525,7 +1525,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
goto err_ret;
closure_init(&d->cl, NULL);
- set_closure_fn(&d->cl, flash_dev_flush, system_wq);
+ set_closure_fn(&d->cl, flash_dev_flush, system_percpu_wq);
kobject_init(&d->kobj, &bch_flash_dev_ktype);
@@ -1718,7 +1718,7 @@ static CLOSURE_CALLBACK(cache_set_flush)
if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
- if (!IS_ERR(c->root))
+ if (!IS_ERR_OR_NULL(c->root))
list_add(&c->root->list, &c->btree_cache);
/*
@@ -1733,7 +1733,12 @@ static CLOSURE_CALLBACK(cache_set_flush)
mutex_unlock(&b->write_lock);
}
- if (ca->alloc_thread)
+ /*
+ * If the register_cache_set() call to bch_cache_set_alloc() failed,
+ * ca has not been assigned a value and an error is returned.
+ * So we need to check that ca is not NULL during bch_cache_set_unregister().
+ */
+ if (ca && ca->alloc_thread)
kthread_stop(ca->alloc_thread);
if (c->journal.cur) {
@@ -1828,7 +1833,7 @@ static CLOSURE_CALLBACK(__cache_set_unregister)
mutex_unlock(&bch_register_lock);
- continue_at(cl, cache_set_flush, system_wq);
+ continue_at(cl, cache_set_flush, system_percpu_wq);
}
void bch_cache_set_stop(struct cache_set *c)
@@ -1858,10 +1863,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
- set_closure_fn(&c->cl, cache_set_free, system_wq);
+ set_closure_fn(&c->cl, cache_set_free, system_percpu_wq);
closure_init(&c->caching, &c->cl);
- set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
+ set_closure_fn(&c->caching, __cache_set_unregister, system_percpu_wq);
/* Maybe create continue_at_noreturn() and use it here? */
closure_set_stopped(&c->cl);
@@ -1907,7 +1912,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
+ iter_size = sizeof(struct btree_iter) +
+ ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
sizeof(struct btree_iter_set);
c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
@@ -1933,7 +1939,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
if (!c->uuids)
goto err;
- c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
+ c->moving_gc_wq = alloc_workqueue("bcache_gc",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!c->moving_gc_wq)
goto err;
@@ -2210,7 +2217,7 @@ void bch_cache_release(struct kobject *kobj)
free_fifo(&ca->free[i]);
if (ca->sb_disk)
- put_page(virt_to_page(ca->sb_disk));
+ folio_put(virt_to_folio(ca->sb_disk));
if (ca->bdev_file)
fput(ca->bdev_file);
@@ -2230,18 +2237,50 @@ static int cache_alloc(struct cache *ca)
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
- bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0);
+ bio_init_inline(&ca->journal.bio, NULL, 8, 0);
/*
- * when ca->sb.njournal_buckets is not zero, journal exists,
- * and in bch_journal_replay(), tree node may split,
- * so bucket of RESERVE_BTREE type is needed,
- * the worst situation is all journal buckets are valid journal,
- * and all the keys need to replay,
- * so the number of RESERVE_BTREE type buckets should be as much
- * as journal buckets
+ * When the cache disk is first registered, ca->sb.njournal_buckets
+ * is zero, and it is assigned in run_cache_set().
+ *
+ * When ca->sb.njournal_buckets is not zero, journal exists,
+ * and in bch_journal_replay(), tree node may split.
+ * The worst situation is all journal buckets are valid journal,
+ * and all the keys need to replay, so the number of RESERVE_BTREE
+ * type buckets should be as much as journal buckets.
+ *
+ * If the number of RESERVE_BTREE type buckets is too few, the
+ * bch_allocator_thread() may hang and be unable to allocate
+ * a bucket. The situation is roughly as follows:
+ *
+ * 1. In bch_data_insert_keys(), if the operation is not op->replace,
+ * it will call the bch_journal(), which increments the journal_ref
+ * counter. This counter is only decremented after bch_btree_insert
+ * completes.
+ *
+ * 2. When bch_btree_insert() is called and the btree needs to split,
+ * it calls btree_split() and btree_check_reserve() to check
+ * whether there are enough reserved buckets in the RESERVE_BTREE
+ * slot. If there are not enough, bcache_btree_root() retries repeatedly.
+ *
+ * 3. Normally, bch_allocator_thread() is responsible for filling
+ * the reservation slots from the free_inc bucket list. When the
+ * free_inc bucket list is exhausted, bch_allocator_thread()
+ * calls invalidate_buckets() until free_inc is refilled.
+ * Then bch_allocator_thread() calls bch_prio_write() once, and
+ * bch_prio_write() calls bch_journal_meta() and waits for
+ * the journal write to complete.
+ *
+ * 4. During journal_write, journal_write_unlocked() is called.
+ * If the journal is full, journal_reclaim() and btree_flush_write()
+ * are called in turn, and then the journal_write is retried.
+ *
+ * 5. When 2 and 4 occur together, the IO hangs and cannot recover.
+ *
+ * Therefore, reserve more RESERVE_BTREE type buckets.
*/
- btree_buckets = ca->sb.njournal_buckets ?: 8;
+ btree_buckets = clamp_t(size_t, ca->sb.nbuckets >> 7,
+ 32, SB_JOURNAL_BUCKETS);
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
if (!free) {
ret = -EPERM;
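As a rough illustration of the new RESERVE_BTREE sizing, the sketch below plugs a few hypothetical ca->sb.nbuckets values into the clamp added above (the bucket counts are made up; SB_JOURNAL_BUCKETS is 256 in current bcache headers):

    /* Hypothetical bucket counts only; mirrors the clamp_t() added above. */
    size_t nbuckets, btree_buckets;

    nbuckets = 2048;        /* 2048 >> 7 = 16   -> clamped up to 32                    */
    btree_buckets = clamp_t(size_t, nbuckets >> 7, 32, SB_JOURNAL_BUCKETS);

    nbuckets = 16384;       /* 16384 >> 7 = 128 -> stays at 128                        */
    btree_buckets = clamp_t(size_t, nbuckets >> 7, 32, SB_JOURNAL_BUCKETS);

    nbuckets = 1 << 20;     /* 2^20 >> 7 = 8192 -> capped at SB_JOURNAL_BUCKETS (256)  */
    btree_buckets = clamp_t(size_t, nbuckets >> 7, 32, SB_JOURNAL_BUCKETS);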
@@ -2344,9 +2383,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
ca->bdev = file_bdev(bdev_file);
ca->sb_disk = sb_disk;
- if (bdev_max_discard_sectors(file_bdev(bdev_file)))
- ca->discard = CACHE_DISCARD(&ca->sb);
-
ret = cache_alloc(ca);
if (ret != 0) {
if (ret == -ENOMEM)
@@ -2493,7 +2529,7 @@ static void register_device_async(struct async_reg_args *args)
INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
/* 10 jiffies is enough for a delay */
- queue_delayed_work(system_wq, &args->reg_work, 10);
+ queue_delayed_work(system_percpu_wq, &args->reg_work, 10);
}
static void *alloc_holder_object(struct cache_sb *sb)
@@ -2555,7 +2591,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!holder) {
ret = -ENOMEM;
err = "cannot allocate memory";
- goto out_put_sb_page;
+ goto out_put_sb_folio;
}
/* Now reopen in exclusive mode with proper holder */
@@ -2629,8 +2665,8 @@ async_done:
out_free_holder:
kfree(holder);
-out_put_sb_page:
- put_page(virt_to_page(sb_disk));
+out_put_sb_folio:
+ folio_put(virt_to_folio(sb_disk));
out_blkdev_put:
if (bdev_file)
fput(bdev_file);
@@ -2867,24 +2903,25 @@ static int __init bcache_init(void)
if (bch_btree_init())
goto err;
- bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+ bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!bcache_wq)
goto err;
/*
* Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
*
- * 1. It used `system_wq` before which also does no memory reclaim.
+ * 1. It used `system_percpu_wq` before which also does no memory reclaim.
* 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
* reduced throughput can be observed.
*
- * We still want to user our own queue to not congest the `system_wq`.
+ * We still want to use our own queue to not congest the `system_percpu_wq`.
*/
- bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
+ bch_flush_wq = alloc_workqueue("bch_flush", WQ_PERCPU, 0);
if (!bch_flush_wq)
goto err;
- bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
+ bch_journal_wq = alloc_workqueue("bch_journal",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!bch_journal_wq)
goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e8f696cb58c0..72f38e5b6f5c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -134,7 +134,6 @@ read_attribute(partial_stripes_expensive);
rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
rw_attribute(io_disable);
-rw_attribute(discard);
rw_attribute(running);
rw_attribute(label);
rw_attribute(errors);
@@ -660,9 +659,7 @@ static unsigned int bch_root_usage(struct cache_set *c)
unsigned int bytes = 0;
struct bkey *k;
struct btree *b;
- struct btree_iter iter;
-
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+ struct btree_iter_stack iter;
goto lock_root;
@@ -1038,7 +1035,6 @@ SHOW(__bch_cache)
sysfs_hprint(bucket_size, bucket_bytes(ca));
sysfs_hprint(block_size, block_bytes(ca));
sysfs_print(nbuckets, ca->sb.nbuckets);
- sysfs_print(discard, ca->discard);
sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
sysfs_hprint(btree_written,
atomic_long_read(&ca->btree_sectors_written) << 9);
@@ -1144,18 +1140,6 @@ STORE(__bch_cache)
if (bcache_is_reboot)
return -EBUSY;
- if (attr == &sysfs_discard) {
- bool v = strtoul_or_return(buf);
-
- if (bdev_max_discard_sectors(ca->bdev))
- ca->discard = v;
-
- if (v != CACHE_DISCARD(&ca->sb)) {
- SET_CACHE_DISCARD(&ca->sb, v);
- bcache_write_super(ca->set);
- }
- }
-
if (attr == &sysfs_cache_replacement_policy) {
v = __sysfs_match_string(cache_replacement_policies, -1, buf);
if (v < 0)
@@ -1187,7 +1171,6 @@ static struct attribute *bch_cache_attrs[] = {
&sysfs_block_size,
&sysfs_nbuckets,
&sysfs_priority_stats,
- &sysfs_discard,
&sysfs_written,
&sysfs_btree_written,
&sysfs_metadata_written,
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 539454d8e2d0..f61ab1bada6c 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
-#include <linux/min_heap.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
@@ -31,10 +30,16 @@ struct closure;
#endif
+#define DECLARE_HEAP(type, name) \
+ struct { \
+ size_t size, used; \
+ type *data; \
+ } name
+
#define init_heap(heap, _size, gfp) \
({ \
size_t _bytes; \
- (heap)->nr = 0; \
+ (heap)->used = 0; \
(heap)->size = (_size); \
_bytes = (heap)->size * sizeof(*(heap)->data); \
(heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \
@@ -47,6 +52,64 @@ do { \
(heap)->data = NULL; \
} while (0)
+#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp) \
+do { \
+ size_t _r, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
+ _r = _j * 2 + 1; \
+ if (_r + 1 < (h)->used && \
+ cmp((h)->data[_r], (h)->data[_r + 1])) \
+ _r++; \
+ \
+ if (cmp((h)->data[_r], (h)->data[_j])) \
+ break; \
+ heap_swap(h, _r, _j); \
+ } \
+} while (0)
+
+#define heap_sift_down(h, i, cmp) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp((h)->data[i], (h)->data[p])) \
+ break; \
+ heap_swap(h, i, p); \
+ i = p; \
+ } \
+} while (0)
+
+#define heap_add(h, d, cmp) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) { \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_pop(h, d, cmp) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ (h)->used--; \
+ heap_swap(h, 0, (h)->used); \
+ heap_sift(h, 0, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
+
+#define heap_full(h) ((h)->used == (h)->size)
+
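The reinstated heap macros are driven entirely by the comparator passed in: as the sift loops above imply, cmp(a, b) should return true when a must sit below b, so a less-than comparator yields a max-heap and heap_pop() returns the largest element first. A minimal usage sketch (hypothetical, kernel context assumed, not part of this patch; free_heap() is assumed unchanged from the existing definition above):

    /* Max-heap of ints: "<" as the comparator. */
    #define int_cmp(l, r)	((l) < (r))

    DECLARE_HEAP(int, demo);
    int v;

    if (!init_heap(&demo, 16, GFP_KERNEL))
            return -ENOMEM;

    heap_add(&demo, 3, int_cmp);
    heap_add(&demo, 9, int_cmp);
    heap_add(&demo, 5, int_cmp);

    while (heap_pop(&demo, v, int_cmp))
            pr_info("popped %d\n", v);      /* prints 9, then 5, then 3 */

    free_heap(&demo);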
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c1d28e365910..4b237074f453 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -331,10 +331,10 @@ static void dirty_init(struct keybuf_key *w)
struct dirty_io *io = w->private;
struct bio *bio = &io->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
if (!io->dc->writeback_percent)
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
bio->bi_private = w;
@@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(struct_size(io, bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
@@ -805,8 +805,7 @@ static int bch_writeback_thread(void *arg)
* may set BCH_ENABLE_AUTO_GC via sysfs, then when
* BCH_DO_AUTO_GC is set, garbage collection thread
* will be woken up here. After moving gc, the shrunk
- * btree and discarded free buckets SSD space may be
- * helpful for following write requests.
+ * btree may be helpful for following write requests.
*/
if (c->gc_after_writeback ==
(BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
@@ -908,16 +907,15 @@ static int bch_dirty_init_thread(void *arg)
struct dirty_init_thrd_info *info = arg;
struct bch_dirty_init_state *state = info->state;
struct cache_set *c = state->c;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
k = p = NULL;
prev_idx = 0;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&c->root->keys, &iter, NULL);
- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k;
@@ -931,7 +929,7 @@ static int bch_dirty_init_thread(void *arg)
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
- k = bch_btree_iter_next_filter(&iter,
+ k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad);
if (k)
@@ -980,13 +978,11 @@ void bch_sectors_dirty_init(struct bcache_device *d)
int i;
struct btree *b = NULL;
struct bkey *k = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct sectors_dirty_init op;
struct cache_set *c = d->c;
struct bch_dirty_init_state state;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
retry_lock:
b = c->root;
rw_lock(0, b, b->level);
@@ -1079,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
- WQ_MEM_RECLAIM, 0);
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!dc->writeback_write_wq)
return -ENOMEM;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index aab8240429b0..e6d28be11c5c 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -41,16 +41,6 @@
#define DM_BUFIO_LOW_WATERMARK_RATIO 16
/*
- * Check buffer ages in this interval (seconds)
- */
-#define DM_BUFIO_WORK_TIMER_SECS 30
-
-/*
- * Free buffers when they are older than this (seconds)
- */
-#define DM_BUFIO_DEFAULT_AGE_SECS 300
-
-/*
* The nr of bytes of cached data to keep around.
*/
#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
@@ -68,6 +58,8 @@
#define LIST_DIRTY 1
#define LIST_SIZE 2
+#define SCAN_RESCHED_CYCLE 16
+
/*--------------------------------------------------------------*/
/*
@@ -1055,10 +1047,8 @@ static unsigned long dm_bufio_cache_size_latch;
static DEFINE_SPINLOCK(global_spinlock);
-/*
- * Buffers are freed after this timeout
- */
-static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned int dm_bufio_max_age; /* No longer does anything */
+
static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
@@ -1086,7 +1076,6 @@ static LIST_HEAD(dm_bufio_all_clients);
static DEFINE_MUTEX(dm_bufio_clients_lock);
static struct workqueue_struct *dm_bufio_wq;
-static struct delayed_work dm_bufio_cleanup_old_work;
static struct work_struct dm_bufio_replacement_work;
@@ -1348,12 +1337,12 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
char *ptr;
unsigned int len;
- bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
+ bio = bio_kmalloc(1, GFP_NOWAIT);
if (!bio) {
use_dmio(b, op, sector, n_sectors, offset, ioprio);
return;
}
- bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
+ bio_init_inline(bio, b->c->bdev, 1, op);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_complete;
bio->bi_private = b;
@@ -1362,7 +1351,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
ptr = (char *)b->data + offset;
len = n_sectors << SECTOR_SHIFT;
- __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
+ bio_add_virt_nofail(bio, ptr, len);
submit_bio(bio);
}
@@ -1612,18 +1601,18 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
* dm-bufio is resistant to allocation failures (it just keeps
* one buffer reserved in cases all the allocations fail).
* So set flags to not try too hard:
- * GFP_NOWAIT: don't wait; if we need to sleep we'll release our
- * mutex and wait ourselves.
+ * GFP_NOWAIT: don't wait and don't print a warning in case of
+ * failure; if we need to sleep we'll release our mutex
+ * and wait ourselves.
* __GFP_NORETRY: don't retry and rather return failure
* __GFP_NOMEMALLOC: don't use emergency reserves
- * __GFP_NOWARN: don't print a warning in case of failure
*
* For debugging, if we set the cache size to 1, no new buffers will
* be allocated.
*/
while (1) {
if (dm_bufio_cache_size_latch != 1) {
- b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC);
if (b)
return b;
}
@@ -2234,7 +2223,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
-static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
+static void forget_buffer(struct dm_bufio_client *c, sector_t block)
{
struct dm_buffer *b;
@@ -2249,8 +2238,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
cache_put_and_wake(c, b);
}
}
-
- return b ? true : false;
}
/*
@@ -2426,7 +2413,12 @@ static void __scan(struct dm_bufio_client *c)
atomic_long_dec(&c->need_shrink);
freed++;
- cond_resched();
+
+ if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) {
+ dm_bufio_unlock(c);
+ cond_resched();
+ dm_bufio_lock(c);
+ }
}
}
}
@@ -2675,130 +2667,6 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
/*--------------------------------------------------------------*/
-static unsigned int get_max_age_hz(void)
-{
- unsigned int max_age = READ_ONCE(dm_bufio_max_age);
-
- if (max_age > UINT_MAX / HZ)
- max_age = UINT_MAX / HZ;
-
- return max_age * HZ;
-}
-
-static bool older_than(struct dm_buffer *b, unsigned long age_hz)
-{
- return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
-}
-
-struct evict_params {
- gfp_t gfp;
- unsigned long age_hz;
-
- /*
- * This gets updated with the largest last_accessed (ie. most
- * recently used) of the evicted buffers. It will not be reinitialised
- * by __evict_many(), so you can use it across multiple invocations.
- */
- unsigned long last_accessed;
-};
-
-/*
- * We may not be able to evict this buffer if IO pending or the client
- * is still using it.
- *
- * And if GFP_NOFS is used, we must not do any I/O because we hold
- * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
- * rerouted to different bufio client.
- */
-static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
-{
- struct evict_params *params = context;
-
- if (!(params->gfp & __GFP_FS) ||
- (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
- if (test_bit_acquire(B_READING, &b->state) ||
- test_bit(B_WRITING, &b->state) ||
- test_bit(B_DIRTY, &b->state))
- return ER_DONT_EVICT;
- }
-
- return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
-}
-
-static unsigned long __evict_many(struct dm_bufio_client *c,
- struct evict_params *params,
- int list_mode, unsigned long max_count)
-{
- unsigned long count;
- unsigned long last_accessed;
- struct dm_buffer *b;
-
- for (count = 0; count < max_count; count++) {
- b = cache_evict(&c->cache, list_mode, select_for_evict, params);
- if (!b)
- break;
-
- last_accessed = READ_ONCE(b->last_accessed);
- if (time_after_eq(params->last_accessed, last_accessed))
- params->last_accessed = last_accessed;
-
- __make_buffer_clean(b);
- __free_buffer_wake(b);
-
- cond_resched();
- }
-
- return count;
-}
-
-static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
-{
- struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
- unsigned long retain = get_retain_buffers(c);
- unsigned long count;
- LIST_HEAD(write_list);
-
- dm_bufio_lock(c);
-
- __check_watermark(c, &write_list);
- if (unlikely(!list_empty(&write_list))) {
- dm_bufio_unlock(c);
- __flush_write_list(&write_list);
- dm_bufio_lock(c);
- }
-
- count = cache_total(&c->cache);
- if (count > retain)
- __evict_many(c, &params, LIST_CLEAN, count - retain);
-
- dm_bufio_unlock(c);
-}
-
-static void cleanup_old_buffers(void)
-{
- unsigned long max_age_hz = get_max_age_hz();
- struct dm_bufio_client *c;
-
- mutex_lock(&dm_bufio_clients_lock);
-
- __cache_size_refresh();
-
- list_for_each_entry(c, &dm_bufio_all_clients, client_list)
- evict_old_buffers(c, max_age_hz);
-
- mutex_unlock(&dm_bufio_clients_lock);
-}
-
-static void work_fn(struct work_struct *w)
-{
- cleanup_old_buffers();
-
- queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
- DM_BUFIO_WORK_TIMER_SECS * HZ);
-}
-
-/*--------------------------------------------------------------*/
-
/*
* Global cleanup tries to evict the oldest buffers from across _all_
* the clients. It does this by repeatedly evicting a few buffers from
@@ -2836,27 +2704,55 @@ static void __insert_client(struct dm_bufio_client *new_client)
list_add_tail(&new_client->client_list, h);
}
+static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
+{
+ /* In no-sleep mode, we cannot wait on IO. */
+ if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) {
+ if (test_bit_acquire(B_READING, &b->state) ||
+ test_bit(B_WRITING, &b->state) ||
+ test_bit(B_DIRTY, &b->state))
+ return ER_DONT_EVICT;
+ }
+ return ER_EVICT;
+}
+
static unsigned long __evict_a_few(unsigned long nr_buffers)
{
- unsigned long count;
struct dm_bufio_client *c;
- struct evict_params params = {
- .gfp = GFP_KERNEL,
- .age_hz = 0,
- /* set to jiffies in case there are no buffers in this client */
- .last_accessed = jiffies
- };
+ unsigned long oldest_buffer = jiffies;
+ unsigned long last_accessed;
+ unsigned long count;
+ struct dm_buffer *b;
c = __pop_client();
if (!c)
return 0;
dm_bufio_lock(c);
- count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
+
+ for (count = 0; count < nr_buffers; count++) {
+ b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL);
+ if (!b)
+ break;
+
+ last_accessed = READ_ONCE(b->last_accessed);
+ if (time_after_eq(oldest_buffer, last_accessed))
+ oldest_buffer = last_accessed;
+
+ __make_buffer_clean(b);
+ __free_buffer_wake(b);
+
+ if (need_resched()) {
+ dm_bufio_unlock(c);
+ cond_resched();
+ dm_bufio_lock(c);
+ }
+ }
+
dm_bufio_unlock(c);
if (count)
- c->oldest_buffer = params.last_accessed;
+ c->oldest_buffer = oldest_buffer;
__insert_client(c);
return count;
@@ -2939,10 +2835,7 @@ static int __init dm_bufio_init(void)
if (!dm_bufio_wq)
return -ENOMEM;
- INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
- queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
- DM_BUFIO_WORK_TIMER_SECS * HZ);
return 0;
}
@@ -2954,7 +2847,6 @@ static void __exit dm_bufio_exit(void)
{
int bug = 0;
- cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
destroy_workqueue(dm_bufio_wq);
if (dm_bufio_client_count) {
@@ -2991,7 +2883,7 @@ module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
-MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
+MODULE_PARM_DESC(max_age_seconds, "No longer does anything");
module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 2ed894155cab..7e1e8cc0e33a 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -590,7 +590,7 @@ static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned in
nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
ht->hash_bits = __ffs(nr_buckets);
- ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
+ ht->buckets = vmalloc_array(nr_buckets, sizeof(*ht->buckets));
if (!ht->buckets)
return -ENOMEM;
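The sizing math in this hunk is unchanged; only the allocation switches to vmalloc_array(). A worked example with a hypothetical entry count:

    /* Hypothetical: nr_entries = 1000. */
    nr_buckets = roundup_pow_of_two(max(1000u / 4u, 16u));  /* max(250, 16) = 250 -> 256 */
    ht->hash_bits = __ffs(nr_buckets);                      /* 256 == 1 << 8 -> 8 bits   */
    ht->buckets = vmalloc_array(nr_buckets, sizeof(*ht->buckets));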
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9cb797a561d6..a10d75a562db 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -406,6 +406,12 @@ struct cache {
mempool_t migration_pool;
struct bio_set bs;
+
+ /*
+ * Cache_size entries. Set bits indicate blocks mapped beyond the
+ * target length, which are marked for invalidation.
+ */
+ unsigned long *invalid_bitset;
};
struct per_bio_data {
@@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache)
if (cache->discard_bitset)
free_bitset(cache->discard_bitset);
+ if (cache->invalid_bitset)
+ free_bitset(cache->invalid_bitset);
+
if (cache->copier)
dm_kcopyd_client_destroy(cache->copier);
@@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+ cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size));
+ if (!cache->invalid_bitset) {
+ *error = "could not allocate bitset for invalid blocks";
+ goto bad;
+ }
+ clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
+
cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(cache->copier)) {
*error = "could not create kcopyd client";
@@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
}
+static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
+{
+ struct cache *cache = context;
+
+ if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) {
+ if (dirty) {
+ DMERR("%s: unable to shrink origin; cache block %u is dirty",
+ cache_device_name(cache), from_cblock(cblock));
+ return -EFBIG;
+ }
+ set_bit(from_cblock(cblock), cache->invalid_bitset);
+ return 0;
+ }
+
+ return load_mapping(context, oblock, cblock, dirty, hint, hint_valid);
+}
+
/*
* The discard block size in the on disk metadata is not
* necessarily the same as we're currently using. So we have to
@@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache)
return to_cblock(size);
}
+static bool can_resume(struct cache *cache)
+{
+ /*
+ * Disallow retrying the resume operation for devices that failed the
+ * first resume attempt, as the failure leaves the policy object partially
+ * initialized. Retrying could trigger BUG_ON when loading cache mappings
+ * into the incomplete policy object.
+ */
+ if (cache->sized && !cache->loaded_mappings) {
+ if (get_cache_mode(cache) != CM_WRITE)
+ DMERR("%s: unable to resume a failed-loaded cache, please check metadata.",
+ cache_device_name(cache));
+ else
+ DMERR("%s: unable to resume cache due to missing proper cache table reload",
+ cache_device_name(cache));
+ return false;
+ }
+
+ return true;
+}
+
static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
@@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
return 0;
}
+static int truncate_oblocks(struct cache *cache)
+{
+ uint32_t nr_blocks = from_cblock(cache->cache_size);
+ uint32_t i;
+ int r;
+
+ for_each_set_bit(i, cache->invalid_bitset, nr_blocks) {
+ r = dm_cache_remove_mapping(cache->cmd, to_cblock(i));
+ if (r) {
+ DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ return r;
+ }
+ }
+
+ return 0;
+}
+
static int cache_preresume(struct dm_target *ti)
{
int r = 0;
struct cache *cache = ti->private;
dm_cblock_t csize = get_cache_dev_size(cache);
+ if (!can_resume(cache))
+ return -EINVAL;
+
/*
* Check to see if the cache has resized.
*/
@@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti)
}
if (!cache->loaded_mappings) {
+ /*
+ * The fast device could have been resized since the last
+ * failed preresume attempt. To be safe, we start with a blank
+ * bitset for the cache blocks.
+ */
+ clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
+
r = dm_cache_load_mappings(cache->cmd, cache->policy,
- load_mapping, cache);
+ load_filtered_mapping, cache);
if (r) {
DMERR("%s: could not load cache mappings", cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_load_mappings", r);
+ if (r != -EFBIG)
+ metadata_operation_failed(cache, "dm_cache_load_mappings", r);
+ return r;
+ }
+
+ r = truncate_oblocks(cache);
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
return r;
}
@@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {2, 2, 0},
+ .version = {2, 3, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3637761f3585..a3c9f74fe2dc 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -141,6 +141,7 @@ struct mapped_device {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
void *zone_revalidate_map;
+ struct task_struct *revalidate_map_task;
#endif
#ifdef CONFIG_IMA
@@ -161,9 +162,7 @@ struct mapped_device {
#define DMF_SUSPENDED_INTERNALLY 7
#define DMF_POST_SUSPENDING 8
#define DMF_EMULATE_ZONE_APPEND 9
-
-void disable_discard(struct mapped_device *md);
-void disable_write_zeroes(struct mapped_device *md);
+#define DMF_QUEUE_STOPPED 10
static inline sector_t dm_get_size(struct mapped_device *md)
{
@@ -293,6 +292,7 @@ struct dm_io {
struct dm_io *next;
struct dm_stats_aux stats_aux;
blk_status_t status;
+ bool requeue_flush_with_data;
atomic_t io_count;
struct mapped_device *md;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1ae2c71bb383..5ef43231fe77 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -17,6 +17,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
+#include <linux/crc32.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/crypto.h>
@@ -59,6 +60,7 @@ struct convert_context {
struct bio *bio_out;
struct bvec_iter iter_out;
atomic_t cc_pending;
+ unsigned int tag_offset;
u64 cc_sector;
union {
struct skcipher_request *req;
@@ -124,7 +126,6 @@ struct iv_lmk_private {
#define TCW_WHITENING_SIZE 16
struct iv_tcw_private {
- struct crypto_shash *crc32_tfm;
u8 *iv_seed;
u8 *whitening;
};
@@ -252,17 +253,35 @@ MODULE_PARM_DESC(max_read_size, "Maximum size of a read request");
static unsigned int max_write_size = 0;
module_param(max_write_size, uint, 0644);
MODULE_PARM_DESC(max_write_size, "Maximum size of a write request");
-static unsigned get_max_request_size(struct crypt_config *cc, bool wrt)
+
+static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio)
{
+ struct crypt_config *cc = ti->private;
unsigned val, sector_align;
- val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size);
- if (likely(!val))
- val = !wrt ? DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE;
- if (wrt || cc->used_tag_size) {
- if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT))
- val = BIO_MAX_VECS << PAGE_SHIFT;
- }
- sector_align = max(bdev_logical_block_size(cc->dev->bdev), (unsigned)cc->sector_size);
+ bool wrt = op_is_write(bio_op(bio));
+
+ if (wrt) {
+ /*
+ * For zoned devices, splitting write operations creates the
+ * risk of deadlocking queue freeze operations with zone write
+ * plugging BIO work when the remainder of a split BIO is
+ * issued. So always allow the entire BIO to proceed.
+ */
+ if (ti->emulate_zone_append)
+ return bio_sectors(bio);
+
+ val = min_not_zero(READ_ONCE(max_write_size),
+ DM_CRYPT_DEFAULT_MAX_WRITE_SIZE);
+ } else {
+ val = min_not_zero(READ_ONCE(max_read_size),
+ DM_CRYPT_DEFAULT_MAX_READ_SIZE);
+ }
+
+ if (wrt || cc->used_tag_size)
+ val = min(val, BIO_MAX_VECS << PAGE_SHIFT);
+
+ sector_align = max(bdev_logical_block_size(cc->dev->bdev),
+ (unsigned)cc->sector_size);
val = round_down(val, sector_align);
if (unlikely(!val))
val = sector_align;
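The tail of the new helper rounds the byte limit down to a multiple of the larger of the device's logical block size and cc->sector_size. A worked example with hypothetical numbers (4096-byte crypt sectors on a 512-byte logical-block device, a 1000000-byte requested limit):

    sector_align = max(512u, 4096u);            /* -> 4096                     */
    val = round_down(1000000u, sector_align);   /* 1000000 -> 999424 bytes     */
    if (!val)                                   /* any limit below 4096 falls  */
            val = sector_align;                 /* back to one aligned chunk   */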
@@ -516,7 +535,10 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
{
struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
SHASH_DESC_ON_STACK(desc, lmk->hash_tfm);
- struct md5_state md5state;
+ union {
+ struct md5_state md5state;
+ u8 state[CRYPTO_MD5_STATESIZE];
+ } u;
__le32 buf[4];
int i, r;
@@ -547,13 +569,13 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
return r;
/* No MD5 padding here */
- r = crypto_shash_export(desc, &md5state);
+ r = crypto_shash_export(desc, &u.md5state);
if (r)
return r;
for (i = 0; i < MD5_HASH_WORDS; i++)
- __cpu_to_le32s(&md5state.hash[i]);
- memcpy(iv, &md5state.hash, cc->iv_size);
+ __cpu_to_le32s(&u.md5state.hash[i]);
+ memcpy(iv, &u.md5state.hash, cc->iv_size);
return 0;
}
@@ -606,10 +628,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
tcw->iv_seed = NULL;
kfree_sensitive(tcw->whitening);
tcw->whitening = NULL;
-
- if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
- crypto_free_shash(tcw->crc32_tfm);
- tcw->crc32_tfm = NULL;
}
static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -627,13 +645,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
- tcw->crc32_tfm = crypto_alloc_shash("crc32", 0,
- CRYPTO_ALG_ALLOCATES_MEMORY);
- if (IS_ERR(tcw->crc32_tfm)) {
- ti->error = "Error initializing CRC32 in TCW";
- return PTR_ERR(tcw->crc32_tfm);
- }
-
tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
if (!tcw->iv_seed || !tcw->whitening) {
@@ -667,36 +678,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc)
return 0;
}
-static int crypt_iv_tcw_whitening(struct crypt_config *cc,
- struct dm_crypt_request *dmreq,
- u8 *data)
+static void crypt_iv_tcw_whitening(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq, u8 *data)
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 buf[TCW_WHITENING_SIZE];
- SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
- int i, r;
+ int i;
/* xor whitening with sector number */
crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
/* calculate crc32 for every 32bit part and xor it */
- desc->tfm = tcw->crc32_tfm;
- for (i = 0; i < 4; i++) {
- r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]);
- if (r)
- goto out;
- }
+ for (i = 0; i < 4; i++)
+ put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]);
crypto_xor(&buf[0], &buf[12], 4);
crypto_xor(&buf[4], &buf[8], 4);
/* apply whitening (8 bytes) to whole sector */
for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
crypto_xor(data + i * 8, buf, 8);
-out:
memzero_explicit(buf, sizeof(buf));
- return r;
}
static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
@@ -706,13 +709,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 *src;
- int r = 0;
/* Remove whitening from ciphertext */
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
sg = crypt_get_sg_data(cc, dmreq->sg_in);
src = kmap_local_page(sg_page(sg));
- r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
+ crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
kunmap_local(src);
}
@@ -722,7 +724,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
cc->iv_size - 8);
- return r;
+ return 0;
}
static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
@@ -730,7 +732,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
{
struct scatterlist *sg;
u8 *dst;
- int r;
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
return 0;
@@ -738,10 +739,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
/* Apply whitening on ciphertext */
sg = crypt_get_sg_data(cc, dmreq->sg_out);
dst = kmap_local_page(sg_page(sg));
- r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
+ crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
kunmap_local(dst);
- return r;
+ return 0;
}
static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
@@ -1187,7 +1188,7 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
tag_len = io->cc->tuple_size * (bio_sectors(bio) >> io->cc->sector_shift);
- bip->bip_iter.bi_sector = io->cc->start + io->sector;
+ bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
tag_len, offset_in_page(io->integrity_metadata));
@@ -1209,11 +1210,11 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
return -EINVAL;
}
- if (bi->tuple_size < cc->used_tag_size) {
+ if (bi->metadata_size < cc->used_tag_size) {
ti->error = "Integrity profile tag size mismatch.";
return -EINVAL;
}
- cc->tuple_size = bi->tuple_size;
+ cc->tuple_size = bi->metadata_size;
if (1 << bi->interval_exp != cc->sector_size) {
ti->error = "Integrity profile sector size mismatch.";
return -EINVAL;
@@ -1256,6 +1257,7 @@ static void crypt_convert_init(struct crypt_config *cc,
if (bio_out)
ctx->iter_out = bio_out->bi_iter;
ctx->cc_sector = sector + cc->iv_offset;
+ ctx->tag_offset = 0;
init_completion(&ctx->restart);
}
@@ -1588,7 +1590,6 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
static blk_status_t crypt_convert(struct crypt_config *cc,
struct convert_context *ctx, bool atomic, bool reset_pending)
{
- unsigned int tag_offset = 0;
unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
int r;
@@ -1611,9 +1612,9 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->cc_pending);
if (crypt_integrity_aead(cc))
- r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset);
+ r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, ctx->tag_offset);
else
- r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset);
+ r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, ctx->tag_offset);
switch (r) {
/*
@@ -1633,8 +1634,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
* exit and continue processing in a workqueue
*/
ctx->r.req = NULL;
+ ctx->tag_offset++;
ctx->cc_sector += sector_step;
- tag_offset++;
return BLK_STS_DEV_RESOURCE;
}
} else {
@@ -1648,8 +1649,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
*/
case -EINPROGRESS:
ctx->r.req = NULL;
+ ctx->tag_offset++;
ctx->cc_sector += sector_step;
- tag_offset++;
continue;
/*
* The request was already processed (synchronously).
@@ -1657,7 +1658,7 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
case 0:
atomic_dec(&ctx->cc_pending);
ctx->cc_sector += sector_step;
- tag_offset++;
+ ctx->tag_offset++;
if (!atomic)
cond_resched();
continue;
@@ -1719,6 +1720,7 @@ retry:
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
clone->bi_ioprio = io->base_bio->bi_ioprio;
+ clone->bi_iter.bi_sector = cc->start + io->sector;
remaining_size = size;
@@ -1909,7 +1911,6 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
crypt_dec_pending(io);
return 1;
}
- clone->bi_iter.bi_sector = cc->start + io->sector;
crypt_convert_init(cc, &io->ctx, clone, clone, io->sector);
io->saved_bi_iter = clone->bi_iter;
dm_submit_bio_remap(io->base_bio, clone);
@@ -1925,13 +1926,13 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
clone = bio_alloc_clone(cc->dev->bdev, io->base_bio, gfp, &cc->bs);
if (!clone)
return 1;
+
+ clone->bi_iter.bi_sector = cc->start + io->sector;
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
crypt_inc_pending(io);
- clone->bi_iter.bi_sector = cc->start + io->sector;
-
if (dm_crypt_integrity_io_alloc(io, clone)) {
crypt_dec_pending(io);
bio_put(clone);
@@ -2039,8 +2040,6 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
/* crypt_convert should have filled the clone bio */
BUG_ON(io->ctx.iter_out.bi_size);
- clone->bi_iter.bi_sector = cc->start + io->sector;
-
if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) ||
test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) {
dm_submit_bio_remap(io->base_bio, clone);
@@ -2092,13 +2091,12 @@ static void kcryptd_crypt_write_continue(struct work_struct *work)
struct crypt_config *cc = io->cc;
struct convert_context *ctx = &io->ctx;
int crypt_finished;
- sector_t sector = io->sector;
blk_status_t r;
wait_for_completion(&ctx->restart);
reinit_completion(&ctx->restart);
- r = crypt_convert(cc, &io->ctx, true, false);
+ r = crypt_convert(cc, &io->ctx, false, false);
if (r)
io->error = r;
crypt_finished = atomic_dec_and_test(&ctx->cc_pending);
@@ -2109,10 +2107,8 @@ static void kcryptd_crypt_write_continue(struct work_struct *work)
}
/* Encryption was already finished, submit io now */
- if (crypt_finished) {
+ if (crypt_finished)
kcryptd_crypt_write_io_submit(io, 0);
- io->sector = sector;
- }
crypt_dec_pending(io);
}
@@ -2123,14 +2119,13 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
struct convert_context *ctx = &io->ctx;
struct bio *clone;
int crypt_finished;
- sector_t sector = io->sector;
blk_status_t r;
/*
* Prevent io from disappearing until this function completes.
*/
crypt_inc_pending(io);
- crypt_convert_init(cc, ctx, NULL, io->base_bio, sector);
+ crypt_convert_init(cc, ctx, NULL, io->base_bio, io->sector);
clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
if (unlikely(!clone)) {
@@ -2147,8 +2142,6 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
io->ctx.iter_in = clone->bi_iter;
}
- sector += bio_sectors(clone);
-
crypt_inc_pending(io);
r = crypt_convert(cc, ctx,
test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags), true);
@@ -2172,10 +2165,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
}
/* Encryption was already finished, submit io now */
- if (crypt_finished) {
+ if (crypt_finished)
kcryptd_crypt_write_io_submit(io, 0);
- io->sector = sector;
- }
dec:
crypt_dec_pending(io);
@@ -2203,7 +2194,7 @@ static void kcryptd_crypt_read_continue(struct work_struct *work)
wait_for_completion(&io->ctx.restart);
reinit_completion(&io->ctx.restart);
- r = crypt_convert(cc, &io->ctx, true, false);
+ r = crypt_convert(cc, &io->ctx, false, false);
if (r)
io->error = r;
@@ -2221,7 +2212,6 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
crypt_inc_pending(io);
if (io->ctx.aead_recheck) {
- io->ctx.cc_sector = io->sector + cc->iv_offset;
r = crypt_convert(cc, &io->ctx,
test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true);
} else {
@@ -3524,7 +3514,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
/*
* Check if bio is too large, split as needed.
*/
- max_sectors = get_max_request_size(cc, bio_data_dir(bio) == WRITE);
+ max_sectors = get_max_request_sectors(ti, bio);
if (unlikely(bio_sectors(bio) > max_sectors))
dm_accept_partial_bio(bio, max_sectors);
@@ -3761,6 +3751,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
max_t(unsigned int, limits->physical_block_size, cc->sector_size);
limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size);
limits->dma_alignment = limits->logical_block_size - 1;
+
+ /*
+ * Zoned dm-crypt targets do not split write BIOs internally, so
+ * nothing stops a write from exceeding BIO_MAX_VECS vectors per
+ * BIO, and crypt_alloc_buffer() would then trigger a BUG(). Avoid
+ * this by forcing DM core to split write BIOs to this limit.
+ */
+ if (ti->emulate_zone_append)
+ limits->max_hw_sectors = min(limits->max_hw_sectors,
+ BIO_MAX_VECS << PAGE_SECTORS_SHIFT);
}
static struct target_type crypt_target = {
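For the zoned write cap set in crypt_io_hints() above: assuming 4 KiB pages (so PAGE_SECTORS_SHIFT == 3) and BIO_MAX_VECS == 256, as in current kernels, the imposed limit works out to:

    /* Assumptions: PAGE_SIZE == 4096, BIO_MAX_VECS == 256. */
    unsigned int cap = 256 << 3;    /* 2048 sectors, i.e. 1 MiB of 512-byte sectors */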
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 08f6387620c1..4bb6553278c7 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -14,11 +14,14 @@
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "delay"
+#define SLEEP_SHIFT 3
+
struct delay_class {
struct dm_dev *dev;
sector_t start;
@@ -34,6 +37,7 @@ struct delay_c {
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
struct task_struct *worker;
+ unsigned int worker_sleep_us;
bool may_delay;
struct delay_class read;
@@ -52,7 +56,7 @@ struct dm_delay_info {
static void handle_delayed_timer(struct timer_list *t)
{
- struct delay_c *dc = from_timer(dc, t, delay_timer);
+ struct delay_c *dc = timer_container_of(dc, t, delay_timer);
queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}
@@ -136,6 +140,7 @@ static int flush_worker_fn(void *data)
schedule();
} else {
spin_unlock(&dc->delayed_bios_lock);
+ fsleep(dc->worker_sleep_us);
cond_resched();
}
}
@@ -212,7 +217,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
int ret;
- unsigned int max_delay;
+ unsigned int max_delay, min_delay;
if (argc != 3 && argc != 6 && argc != 9) {
ti->error = "Requires exactly 3, 6 or 9 arguments";
@@ -235,7 +240,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->read, argv);
if (ret)
goto bad;
- max_delay = dc->read.delay;
+ min_delay = max_delay = dc->read.delay;
if (argc == 3) {
ret = delay_class_ctr(ti, &dc->write, argv);
@@ -251,6 +256,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
max_delay = max(max_delay, dc->write.delay);
+ min_delay = min_not_zero(min_delay, dc->write.delay);
if (argc == 6) {
ret = delay_class_ctr(ti, &dc->flush, argv + 3);
@@ -263,9 +269,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
max_delay = max(max_delay, dc->flush.delay);
+ min_delay = min_not_zero(min_delay, dc->flush.delay);
out:
if (max_delay < 50) {
+ if (min_delay >> SLEEP_SHIFT)
+ dc->worker_sleep_us = 1000;
+ else
+ dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT;
/*
* In case of small requested delays, use kthread instead of
* timers and workqueue to achieve better latency.
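The worker_sleep_us calculation added above paces the flush kthread at roughly one eighth of the shortest configured delay (SLEEP_SHIFT == 3), capped at 1 ms once min_delay reaches 8 ms. Worked through with hypothetical delays (milliseconds in, microseconds out):

    /*
     * min_delay = 2 ms: 2 >> 3 == 0 -> worker_sleep_us = (2 * 1000) >> 3 = 250 us
     * min_delay = 7 ms: 7 >> 3 == 0 -> worker_sleep_us = (7 * 1000) >> 3 = 875 us
     * min_delay = 8 ms: 8 >> 3 == 1 -> worker_sleep_us = 1000 us (1 ms cap)
     */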
@@ -369,6 +380,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
return delay_bio(dc, c, bio);
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int delay_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+ struct delay_c *dc = ti->private;
+ struct delay_class *c = &dc->read;
+
+ return dm_report_zones(c->dev->bdev, c->start,
+ c->start + dm_target_offset(ti, args->next_sector),
+ args, nr_zones);
+}
+#else
+#define delay_report_zones NULL
+#endif
+
#define DMEMIT_DELAY_CLASS(c) \
DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
@@ -423,12 +449,13 @@ out:
static struct target_type delay_target = {
.name = "delay",
- .version = {1, 4, 0},
- .features = DM_TARGET_PASSES_INTEGRITY,
+ .version = {1, 5, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,
.map = delay_map,
+ .report_zones = delay_report_zones,
.presuspend = delay_presuspend,
.resume = delay_resume,
.status = delay_status,
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index 1a33820c9f46..e75310232bbf 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -534,7 +534,9 @@ static void dust_status(struct dm_target *ti, status_type_t type,
}
}
-static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct dust_device *dd = ti->private;
struct dm_dev *dev = dd->dev;
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index ec5db1478b2f..6abb31ca9662 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
+static void ebs_postsuspend(struct dm_target *ti)
+{
+ struct ebs_c *ec = ti->private;
+ dm_bufio_client_reset(ec->bufio);
+}
+
static void ebs_status(struct dm_target *ti, status_type_t type,
unsigned int status_flags, char *result, unsigned int maxlen)
{
@@ -409,7 +415,8 @@ static void ebs_status(struct dm_target *ti, status_type_t type,
}
}
-static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg, bool *forward)
{
struct ebs_c *ec = ti->private;
struct dm_dev *dev = ec->dev;
@@ -442,11 +449,12 @@ static int ebs_iterate_devices(struct dm_target *ti,
static struct target_type ebs_target = {
.name = "ebs",
.version = {1, 0, 1},
- .features = DM_TARGET_PASSES_INTEGRITY,
+ .features = 0,
.module = THIS_MODULE,
.ctr = ebs_ctr,
.dtr = ebs_dtr,
.map = ebs_map,
+ .postsuspend = ebs_postsuspend,
.status = ebs_status,
.io_hints = ebs_io_hints,
.prepare_ioctl = ebs_prepare_ioctl,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 731467d4ed10..08925aca838c 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -47,14 +47,15 @@ enum feature_flag_bits {
};
struct per_bio_data {
- bool bio_submitted;
+ bool bio_can_corrupt;
+ struct bvec_iter saved_iter;
};
static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
struct dm_target *ti)
{
- int r;
- unsigned int argc;
+ int r = 0;
+ unsigned int argc = 0;
const char *arg_name;
static const struct dm_arg _args[] = {
@@ -65,14 +66,13 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
{0, PROBABILITY_BASE, "Invalid random corrupt argument"},
};
- /* No feature arguments supplied. */
- if (!as->argc)
- return 0;
-
- r = dm_read_arg_group(_args, as, &argc, &ti->error);
- if (r)
+ if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error)))
return r;
+ /* No feature arguments supplied. */
+ if (!argc)
+ goto error_all_io;
+
while (argc) {
arg_name = dm_shift_arg(as);
argc--;
@@ -128,8 +128,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
* corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
*/
if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
- if (!argc) {
- ti->error = "Feature corrupt_bio_byte requires parameters";
+ if (fc->corrupt_bio_byte) {
+ ti->error = "Feature corrupt_bio_byte duplicated";
+ return -EINVAL;
+ } else if (argc < 4) {
+ ti->error = "Feature corrupt_bio_byte requires 4 parameters";
return -EINVAL;
}
@@ -176,7 +179,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
}
if (!strcasecmp(arg_name, "random_read_corrupt")) {
- if (!argc) {
+ if (fc->random_read_corrupt) {
+ ti->error = "Feature random_read_corrupt duplicated";
+ return -EINVAL;
+ } else if (!argc) {
ti->error = "Feature random_read_corrupt requires a parameter";
return -EINVAL;
}
@@ -189,7 +195,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
}
if (!strcasecmp(arg_name, "random_write_corrupt")) {
- if (!argc) {
+ if (fc->random_write_corrupt) {
+ ti->error = "Feature random_write_corrupt duplicated";
+ return -EINVAL;
+ } else if (!argc) {
ti->error = "Feature random_write_corrupt requires a parameter";
return -EINVAL;
}
@@ -205,18 +214,28 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
return -EINVAL;
}
- if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
- ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+ if (test_bit(DROP_WRITES, &fc->flags) &&
+ ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) ||
+ fc->random_write_corrupt)) {
+ ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set";
return -EINVAL;
- } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
- ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+ } else if (test_bit(ERROR_WRITES, &fc->flags) &&
+ ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) ||
+ fc->random_write_corrupt)) {
+ ti->error = "error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set";
+ return -EINVAL;
+ } else if (test_bit(ERROR_READS, &fc->flags) &&
+ ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == READ) ||
+ fc->random_read_corrupt)) {
+ ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag set";
return -EINVAL;
}
if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) &&
!test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) &&
!fc->random_read_corrupt && !fc->random_write_corrupt) {
+error_all_io:
set_bit(ERROR_WRITES, &fc->flags);
set_bit(ERROR_READS, &fc->flags);
}
@@ -278,7 +297,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+ r = dm_read_arg(_args + 1, &as, &fc->down_interval, &ti->error);
if (r)
goto bad;
@@ -339,7 +358,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
}
static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
- unsigned char corrupt_bio_value)
+ unsigned char corrupt_bio_value,
+ struct bvec_iter start)
{
struct bvec_iter iter;
struct bio_vec bvec;
@@ -348,7 +368,7 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
* Overwrite the Nth byte of the bio's data, on whichever page
* it falls.
*/
- bio_for_each_segment(bvec, bio, iter) {
+ __bio_for_each_segment(bvec, bio, iter, start) {
if (bio_iter_len(bio, iter) > corrupt_bio_byte) {
unsigned char *segment = bvec_kmap_local(&bvec);
segment[corrupt_bio_byte] = corrupt_bio_value;
@@ -357,36 +377,31 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
"(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n",
bio, corrupt_bio_value, corrupt_bio_byte,
(bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size);
+ (unsigned long long)start.bi_sector,
+ start.bi_size);
break;
}
corrupt_bio_byte -= bio_iter_len(bio, iter);
}
}
-static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc,
+ struct bvec_iter start)
{
unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1;
- if (!bio_has_data(bio))
- return;
-
- corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value);
+ corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start);
}
-static void corrupt_bio_random(struct bio *bio)
+static void corrupt_bio_random(struct bio *bio, struct bvec_iter start)
{
unsigned int corrupt_byte;
unsigned char corrupt_value;
- if (!bio_has_data(bio))
- return;
-
- corrupt_byte = get_random_u32() % bio->bi_iter.bi_size;
+ corrupt_byte = get_random_u32() % start.bi_size;
corrupt_value = get_random_u8();
- corrupt_bio_common(bio, corrupt_byte, corrupt_value);
+ corrupt_bio_common(bio, corrupt_byte, corrupt_value, start);
}
static void clone_free(struct bio *clone)
@@ -426,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
if (!clone)
return NULL;
- bio_init(clone, fc->dev->bdev, bio->bi_inline_vecs, nr_iovecs, bio->bi_opf);
+ bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf);
clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector);
clone->bi_private = bio;
@@ -481,7 +496,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
unsigned int elapsed;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
- pb->bio_submitted = false;
+ pb->bio_can_corrupt = false;
if (op_is_zone_mgmt(bio_op(bio)))
goto map_bio;
@@ -490,14 +505,15 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
elapsed = (jiffies - fc->start_time) / HZ;
if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
bool corrupt_fixed, corrupt_random;
- /*
- * Flag this bio as submitted while down.
- */
- pb->bio_submitted = true;
+
+ if (bio_has_data(bio)) {
+ pb->bio_can_corrupt = true;
+ pb->saved_iter = bio->bi_iter;
+ }
/*
- * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set.
- * Otherwise, flakey_end_io() will decide if the reads should be modified.
+ * If ERROR_READS isn't set, flakey_end_io() will decide whether
+ * the reads should be modified.
*/
if (bio_data_dir(bio) == READ) {
if (test_bit(ERROR_READS, &fc->flags))
@@ -516,6 +532,8 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+ if (!pb->bio_can_corrupt)
+ goto map_bio;
/*
* Corrupt matching writes.
*/
@@ -535,9 +553,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
struct bio *clone = clone_bio(ti, fc, bio);
if (clone) {
if (corrupt_fixed)
- corrupt_bio_data(clone, fc);
+ corrupt_bio_data(clone, fc,
+ clone->bi_iter);
if (corrupt_random)
- corrupt_bio_random(clone);
+ corrupt_bio_random(clone,
+ clone->bi_iter);
submit_bio(clone);
return DM_MAPIO_SUBMITTED;
}
@@ -559,28 +579,21 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
if (op_is_zone_mgmt(bio_op(bio)))
return DM_ENDIO_DONE;
- if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
+ if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) {
if (fc->corrupt_bio_byte) {
if ((fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc)) {
/*
* Corrupt successful matching READs while in down state.
*/
- corrupt_bio_data(bio, fc);
+ corrupt_bio_data(bio, fc, pb->saved_iter);
}
}
if (fc->random_read_corrupt) {
u64 rnd = get_random_u64();
u32 rem = do_div(rnd, PROBABILITY_BASE);
if (rem < fc->random_read_corrupt)
- corrupt_bio_random(bio);
- }
- if (test_bit(ERROR_READS, &fc->flags)) {
- /*
- * Error read during the down_interval if drop_writes
- * and error_writes were not configured.
- */
- *error = BLK_STS_IOERR;
+ corrupt_bio_random(bio, pb->saved_iter);
}
}
@@ -638,7 +651,9 @@ static void flakey_status(struct dm_target *ti, status_type_t type,
}
}
-static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct flakey_c *fc = ti->private;
diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c
index b90f34259fbb..efb3cd4f9cd4 100644
--- a/drivers/md/dm-ima.c
+++ b/drivers/md/dm-ima.c
@@ -45,7 +45,7 @@ static void fix_separator_chars(char **buf)
/*
* Internal function to allocate memory for IMA measurements.
*/
-static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio)
+static void *dm_ima_alloc(size_t len, bool noio)
{
unsigned int noio_flag;
void *ptr;
@@ -53,7 +53,7 @@ static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio)
if (noio)
noio_flag = memalloc_noio_save();
- ptr = kzalloc(len, flags);
+ ptr = kzalloc(len, GFP_KERNEL);
if (noio)
memalloc_noio_restore(noio_flag);
@@ -68,13 +68,13 @@ static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_
char **dev_uuid, bool noio)
{
int r;
- *dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio);
+ *dev_name = dm_ima_alloc(DM_NAME_LEN*2, noio);
if (!(*dev_name)) {
r = -ENOMEM;
goto error;
}
- *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio);
+ *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, noio);
if (!(*dev_uuid)) {
r = -ENOMEM;
goto error;
@@ -109,7 +109,7 @@ static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **de
if (r)
return r;
- *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio);
+ *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio);
if (!(*device_data)) {
r = -ENOMEM;
goto error;
@@ -153,14 +153,12 @@ static int dm_ima_alloc_and_copy_capacity_str(struct mapped_device *md, char **c
capacity = get_capacity(md->disk);
- *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, GFP_KERNEL, noio);
+ *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, noio);
if (!(*capacity_str))
return -ENOMEM;
- scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;",
- capacity);
-
- return 0;
+ return scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;",
+ capacity);
}
/*
@@ -195,15 +193,15 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1;
char table_load_event_name[] = "dm_table_load";
- ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio);
+ ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio);
if (!ima_buf)
return;
- target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio);
+ target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, noio);
if (!target_metadata_buf)
goto error;
- target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio);
+ target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, noio);
if (!target_data_buf)
goto error;
@@ -218,7 +216,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
shash->tfm = tfm;
digest_size = crypto_shash_digestsize(tfm);
- digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio);
+ digest = dm_ima_alloc(digest_size, noio);
if (!digest)
goto error;
@@ -241,10 +239,11 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
/*
* First retrieve the target metadata.
*/
- scnprintf(target_metadata_buf, DM_IMA_TARGET_METADATA_BUF_LEN,
- "target_index=%d,target_begin=%llu,target_len=%llu,",
- i, ti->begin, ti->len);
- target_metadata_buf_len = strlen(target_metadata_buf);
+ target_metadata_buf_len =
+ scnprintf(target_metadata_buf,
+ DM_IMA_TARGET_METADATA_BUF_LEN,
+ "target_index=%d,target_begin=%llu,target_len=%llu,",
+ i, ti->begin, ti->len);
/*
* Then retrieve the actual target data.
@@ -326,7 +325,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
if (r < 0)
goto error;
- digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, GFP_KERNEL, noio);
+ digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio);
if (!digest_buf)
goto error;
@@ -370,18 +369,18 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap)
{
char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL;
char active[] = "active_table_hash=";
- unsigned int active_len = strlen(active), capacity_len = 0;
+ unsigned int active_len = strlen(active);
unsigned int l = 0;
bool noio = true;
bool nodata = true;
- int r;
+ int capacity_len;
- device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio);
+ device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio);
if (!device_table_data)
return;
- r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
- if (r)
+ capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
+ if (capacity_len < 0)
goto error;
memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len);
@@ -444,18 +443,14 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap)
}
if (nodata) {
- r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio);
- if (r)
+ if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio))
goto error;
- scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN,
- "%sname=%s,uuid=%s;device_resume=no_data;",
- DM_IMA_VERSION_STR, dev_name, dev_uuid);
- l = strlen(device_table_data);
-
+ l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN,
+ "%sname=%s,uuid=%s;device_resume=no_data;",
+ DM_IMA_VERSION_STR, dev_name, dev_uuid);
}
- capacity_len = strlen(capacity_str);
memcpy(device_table_data + l, capacity_str, capacity_len);
l += capacity_len;
@@ -484,18 +479,17 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all)
unsigned int device_active_len = strlen(device_active_str);
unsigned int device_inactive_len = strlen(device_inactive_str);
unsigned int remove_all_len = strlen(remove_all_str);
- unsigned int capacity_len = 0;
unsigned int l = 0;
bool noio = true;
bool nodata = true;
- int r;
+ int capacity_len;
- device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, GFP_KERNEL, noio);
+ device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, noio);
if (!device_table_data)
goto exit;
- r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
- if (r) {
+ capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
+ if (capacity_len < 0) {
kfree(device_table_data);
goto exit;
}
@@ -561,10 +555,9 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all)
if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio))
goto error;
- scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN,
- "%sname=%s,uuid=%s;device_remove=no_data;",
- DM_IMA_VERSION_STR, dev_name, dev_uuid);
- l = strlen(device_table_data);
+ l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN,
+ "%sname=%s,uuid=%s;device_remove=no_data;",
+ DM_IMA_VERSION_STR, dev_name, dev_uuid);
}
memcpy(device_table_data + l, remove_all_str, remove_all_len);
@@ -572,7 +565,6 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all)
memcpy(device_table_data + l, remove_all ? "y;" : "n;", 2);
l += 2;
- capacity_len = strlen(capacity_str);
memcpy(device_table_data + l, capacity_str, capacity_len);
l += capacity_len;
@@ -604,20 +596,20 @@ exit:
*/
void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map)
{
- unsigned int l = 0, capacity_len = 0;
+ unsigned int l = 0;
char *device_table_data = NULL, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL;
char inactive_str[] = "inactive_table_hash=";
unsigned int inactive_len = strlen(inactive_str);
bool noio = true;
bool nodata = true;
- int r;
+ int capacity_len;
- device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio);
+ device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio);
if (!device_table_data)
return;
- r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
- if (r)
+ capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
+ if (capacity_len < 0)
goto error1;
memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len);
@@ -647,13 +639,11 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map)
if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio))
goto error2;
- scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN,
- "%sname=%s,uuid=%s;table_clear=no_data;",
- DM_IMA_VERSION_STR, dev_name, dev_uuid);
- l = strlen(device_table_data);
+ l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN,
+ "%sname=%s,uuid=%s;table_clear=no_data;",
+ DM_IMA_VERSION_STR, dev_name, dev_uuid);
}
- capacity_len = strlen(capacity_str);
memcpy(device_table_data + l, capacity_str, capacity_len);
l += capacity_len;
@@ -706,7 +696,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md)
char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL;
char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL;
bool noio = true;
- int r;
+ int len;
if (dm_ima_alloc_and_copy_device_data(md, &new_device_data,
md->ima.active_table.num_targets, noio))
@@ -715,12 +705,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md)
if (dm_ima_alloc_and_copy_name_uuid(md, &new_dev_name, &new_dev_uuid, noio))
goto error;
- combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, GFP_KERNEL, noio);
+ combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, noio);
if (!combined_device_data)
goto error;
- r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
- if (r)
+ if (dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio) < 0)
goto error;
old_device_data = md->ima.active_table.device_metadata;
@@ -728,12 +717,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md)
md->ima.active_table.device_metadata = new_device_data;
md->ima.active_table.device_metadata_len = strlen(new_device_data);
- scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2,
- "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data,
- new_dev_name, new_dev_uuid, capacity_str);
+ len = scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2,
+ "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data,
+ new_dev_name, new_dev_uuid, capacity_str);
- dm_ima_measure_data("dm_device_rename", combined_device_data, strlen(combined_device_data),
- noio);
+ dm_ima_measure_data("dm_device_rename", combined_device_data, len, noio);
goto exit;
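
A small sketch (not part of the patch) of the pattern the dm-ima hunks above switch to: scnprintf() returns the number of characters stored in the buffer (excluding the terminating NUL and never more than the buffer size minus one), so the length can be taken from its return value instead of a follow-up strlen() on the buffer it just filled. The helper name below is hypothetical.

	#include <linux/kernel.h>

	static int format_event(char *buf, size_t len,
				const char *name, const char *uuid)
	{
		/* previously: scnprintf(...); l = strlen(buf); */
		return scnprintf(buf, len, "name=%s,uuid=%s;", name, uuid);
	}
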
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index ee9f7cecd78e..170bf67a2edd 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
+#include <crypto/utils.h>
#include <linux/async_tx.h>
#include <linux/dm-bufio.h>
@@ -132,7 +133,7 @@ struct journal_sector {
commit_id_t commit_id;
};
-#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
+#define MAX_TAG_SIZE 255
#define METADATA_PADDING_SECTORS 8
@@ -218,10 +219,13 @@ struct dm_integrity_c {
__u8 log2_blocks_per_bitmap_bit;
unsigned char mode;
+ bool internal_hash;
int failed;
- struct crypto_shash *internal_hash;
+ struct crypto_shash *internal_shash;
+ struct crypto_ahash *internal_ahash;
+ unsigned int internal_hash_digestsize;
struct dm_target *ti;
@@ -276,6 +280,9 @@ struct dm_integrity_c {
bool fix_hmac;
bool legacy_recalculate;
+ mempool_t ahash_req_pool;
+ struct ahash_request *journal_ahash_req;
+
struct alg_spec internal_hash_alg;
struct alg_spec journal_crypt_alg;
struct alg_spec journal_mac_alg;
@@ -325,6 +332,8 @@ struct dm_integrity_io {
unsigned payload_len;
bool integrity_payload_from_mempool;
bool integrity_range_locked;
+
+ struct ahash_request *ahash_req;
};
struct journal_completion {
@@ -351,6 +360,7 @@ struct bitmap_block_status {
static struct kmem_cache *journal_io_cache;
#define JOURNAL_IO_MEMPOOL 32
+#define AHASH_MEMPOOL 32
#ifdef DEBUG_PRINT
#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__)
@@ -516,7 +526,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
dm_integrity_io_error(ic, "crypto_shash_digest", r);
return r;
}
- if (memcmp(mac, actual_mac, mac_size)) {
+ if (crypto_memneq(mac, actual_mac, mac_size)) {
dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
return -EILSEQ;
@@ -859,7 +869,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool
if (likely(wr))
memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
else {
- if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
+ if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
dm_integrity_io_error(ic, "journal mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0);
}
@@ -1401,10 +1411,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
unsigned int *metadata_offset, unsigned int total_size, int op)
{
-#define MAY_BE_FILLER 1
-#define MAY_BE_HASH 2
unsigned int hash_offset = 0;
- unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ unsigned char mismatch_hash = 0;
+ unsigned char mismatch_filler = !ic->discard;
do {
unsigned char *data, *dp;
@@ -1425,7 +1434,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
if (op == TAG_READ) {
memcpy(tag, dp, to_copy);
} else if (op == TAG_WRITE) {
- if (memcmp(dp, tag, to_copy)) {
+ if (crypto_memneq(dp, tag, to_copy)) {
memcpy(dp, tag, to_copy);
dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
}
@@ -1433,29 +1442,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
/* e.g.: op == TAG_CMP */
if (likely(is_power_of_2(ic->tag_size))) {
- if (unlikely(memcmp(dp, tag, to_copy)))
- if (unlikely(!ic->discard) ||
- unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) {
- goto thorough_test;
- }
+ if (unlikely(crypto_memneq(dp, tag, to_copy)))
+ goto thorough_test;
} else {
unsigned int i, ts;
thorough_test:
ts = total_size;
for (i = 0; i < to_copy; i++, ts--) {
- if (unlikely(dp[i] != tag[i]))
- may_be &= ~MAY_BE_HASH;
- if (likely(dp[i] != DISCARD_FILLER))
- may_be &= ~MAY_BE_FILLER;
+ /*
+ * Warning: the control flow must not be
+ * dependent on match/mismatch of
+ * individual bytes.
+ */
+ mismatch_hash |= dp[i] ^ tag[i];
+ mismatch_filler |= dp[i] ^ DISCARD_FILLER;
hash_offset++;
if (unlikely(hash_offset == ic->tag_size)) {
- if (unlikely(!may_be)) {
+ if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) {
dm_bufio_release(b);
return ts;
}
hash_offset = 0;
- may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ mismatch_hash = 0;
+ mismatch_filler = !ic->discard;
}
}
}
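
A simplified illustration (not part of the patch) of the constant-time comparison the rewritten TAG_CMP loop above performs: differences are OR-accumulated byte by byte and only inspected once per tag, so the control flow does not depend on where, or whether, an individual byte mismatches. crypto_memneq() from <crypto/utils.h> provides the same guarantee for whole-buffer comparisons.

	static int tags_differ(const unsigned char *a, const unsigned char *b,
			       size_t len)
	{
		unsigned char diff = 0;
		size_t i;

		for (i = 0; i < len; i++)
			diff |= a[i] ^ b[i];	/* no data-dependent branches */

		return diff != 0;
	}
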
@@ -1476,8 +1486,6 @@ thorough_test:
} while (unlikely(total_size));
return 0;
-#undef MAY_BE_FILLER
-#undef MAY_BE_HASH
}
struct flush_request {
@@ -1541,7 +1549,8 @@ static void sleep_on_endio_wait(struct dm_integrity_c *ic)
static void autocommit_fn(struct timer_list *t)
{
- struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer);
+ struct dm_integrity_c *ic = timer_container_of(ic, t,
+ autocommit_timer);
if (likely(!dm_integrity_failed(ic)))
queue_work(ic->commit_wq, &ic->commit_work);
@@ -1634,15 +1643,15 @@ static void integrity_end_io(struct bio *bio)
dec_in_flight(dio);
}
-static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
- const char *data, char *result)
+static void integrity_sector_checksum_shash(struct dm_integrity_c *ic, sector_t sector,
+ const char *data, unsigned offset, char *result)
{
__le64 sector_le = cpu_to_le64(sector);
- SHASH_DESC_ON_STACK(req, ic->internal_hash);
+ SHASH_DESC_ON_STACK(req, ic->internal_shash);
int r;
unsigned int digest_size;
- req->tfm = ic->internal_hash;
+ req->tfm = ic->internal_shash;
r = crypto_shash_init(req);
if (unlikely(r < 0)) {
@@ -1664,7 +1673,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
goto failed;
}
- r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
+ r = crypto_shash_update(req, data + offset, ic->sectors_per_block << SECTOR_SHIFT);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_update", r);
goto failed;
@@ -1676,7 +1685,70 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
goto failed;
}
- digest_size = crypto_shash_digestsize(ic->internal_hash);
+ digest_size = ic->internal_hash_digestsize;
+ if (unlikely(digest_size < ic->tag_size))
+ memset(result + digest_size, 0, ic->tag_size - digest_size);
+
+ return;
+
+failed:
+ /* this shouldn't happen anyway, the hash functions have no reason to fail */
+ get_random_bytes(result, ic->tag_size);
+}
+
+static void integrity_sector_checksum_ahash(struct dm_integrity_c *ic, struct ahash_request **ahash_req,
+ sector_t sector, struct page *page, unsigned offset, char *result)
+{
+ __le64 sector_le = cpu_to_le64(sector);
+ struct ahash_request *req;
+ DECLARE_CRYPTO_WAIT(wait);
+ struct scatterlist sg[3], *s = sg;
+ int r;
+ unsigned int digest_size;
+ unsigned int nbytes = 0;
+
+ might_sleep();
+
+ req = *ahash_req;
+ if (unlikely(!req)) {
+ req = mempool_alloc(&ic->ahash_req_pool, GFP_NOIO);
+ *ahash_req = req;
+ }
+
+ ahash_request_set_tfm(req, ic->internal_ahash);
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait);
+
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
+ sg_init_table(sg, 3);
+ sg_set_buf(s, (const __u8 *)&ic->sb->salt, SALT_SIZE);
+ nbytes += SALT_SIZE;
+ s++;
+ } else {
+ sg_init_table(sg, 2);
+ }
+
+ if (likely(!is_vmalloc_addr(&sector_le))) {
+ sg_set_buf(s, &sector_le, sizeof(sector_le));
+ } else {
+ struct page *sec_page = vmalloc_to_page(&sector_le);
+ unsigned int sec_off = offset_in_page(&sector_le);
+ sg_set_page(s, sec_page, sizeof(sector_le), sec_off);
+ }
+ nbytes += sizeof(sector_le);
+ s++;
+
+ sg_set_page(s, page, ic->sectors_per_block << SECTOR_SHIFT, offset);
+ nbytes += ic->sectors_per_block << SECTOR_SHIFT;
+
+ ahash_request_set_crypt(req, sg, result, nbytes);
+
+ r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_ahash_digest", r);
+ goto failed;
+ }
+
+ digest_size = ic->internal_hash_digestsize;
if (unlikely(digest_size < ic->tag_size))
memset(result + digest_size, 0, ic->tag_size - digest_size);
@@ -1687,6 +1759,41 @@ failed:
get_random_bytes(result, ic->tag_size);
}
+static void integrity_sector_checksum(struct dm_integrity_c *ic, struct ahash_request **ahash_req,
+ sector_t sector, const char *data, unsigned offset, char *result)
+{
+ if (likely(ic->internal_shash != NULL))
+ integrity_sector_checksum_shash(ic, sector, data, offset, result);
+ else
+ integrity_sector_checksum_ahash(ic, ahash_req, sector, (struct page *)data, offset, result);
+}
+
+static void *integrity_kmap(struct dm_integrity_c *ic, struct page *p)
+{
+ if (likely(ic->internal_shash != NULL))
+ return kmap_local_page(p);
+ else
+ return p;
+}
+
+static void integrity_kunmap(struct dm_integrity_c *ic, const void *ptr)
+{
+ if (likely(ic->internal_shash != NULL))
+ kunmap_local(ptr);
+}
+
+static void *integrity_identity(struct dm_integrity_c *ic, void *data)
+{
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(offset_in_page(data));
+ BUG_ON(!virt_addr_valid(data));
+#endif
+ if (likely(ic->internal_shash != NULL))
+ return data;
+ else
+ return virt_to_page(data);
+}
+
static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum)
{
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
@@ -1711,6 +1818,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
sector_t alignment;
char *mem;
char *buffer = page_to_virt(page);
+ unsigned int buffer_offset;
int r;
struct dm_io_request io_req;
struct dm_io_region io_loc;
@@ -1728,7 +1836,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
alignment &= -alignment;
io_loc.sector = round_down(io_loc.sector, alignment);
io_loc.count += sector - io_loc.sector;
- buffer += (sector - io_loc.sector) << SECTOR_SHIFT;
+ buffer_offset = (sector - io_loc.sector) << SECTOR_SHIFT;
io_loc.count = round_up(io_loc.count, alignment);
r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
@@ -1737,7 +1845,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
goto free_ret;
}
- integrity_sector_checksum(ic, logical_sector, buffer, checksum);
+ integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, integrity_identity(ic, buffer), buffer_offset, checksum);
r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block,
&dio->metadata_offset, ic->tag_size, TAG_CMP);
if (r) {
@@ -1754,7 +1862,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
}
mem = bvec_kmap_local(&bv);
- memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT);
+ memcpy(mem + pos, buffer + buffer_offset, ic->sectors_per_block << SECTOR_SHIFT);
kunmap_local(mem);
pos += ic->sectors_per_block << SECTOR_SHIFT;
@@ -1776,7 +1884,7 @@ static void integrity_metadata(struct work_struct *w)
if (ic->internal_hash) {
struct bvec_iter iter;
struct bio_vec bv;
- unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
+ unsigned int digest_size = ic->internal_hash_digestsize;
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
char *checksums;
unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
@@ -1837,17 +1945,17 @@ static void integrity_metadata(struct work_struct *w)
char *mem, *checksums_ptr;
again:
- mem = bvec_kmap_local(&bv_copy);
+ mem = integrity_kmap(ic, bv_copy.bv_page);
pos = 0;
checksums_ptr = checksums;
do {
- integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
+ integrity_sector_checksum(ic, &dio->ahash_req, sector, mem, bv_copy.bv_offset + pos, checksums_ptr);
checksums_ptr += ic->tag_size;
sectors_to_process -= ic->sectors_per_block;
pos += ic->sectors_per_block << SECTOR_SHIFT;
sector += ic->sectors_per_block;
} while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack);
- kunmap_local(mem);
+ integrity_kunmap(ic, mem);
r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
@@ -1949,6 +2057,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
dio->ic = ic;
dio->bi_status = 0;
dio->op = bio_op(bio);
+ dio->ahash_req = NULL;
if (ic->mode == 'I') {
bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector);
@@ -2071,19 +2180,6 @@ retry_kmap:
js++;
mem_ptr += 1 << SECTOR_SHIFT;
} while (++s < ic->sectors_per_block);
-#ifdef INTERNAL_VERIFY
- if (ic->internal_hash) {
- char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
-
- integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
- if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
- DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
- logical_sector);
- dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
- bio, logical_sector, 0);
- }
- }
-#endif
}
if (!ic->internal_hash) {
@@ -2124,15 +2220,17 @@ retry_kmap:
} while (++s < ic->sectors_per_block);
if (ic->internal_hash) {
- unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
+ unsigned int digest_size = ic->internal_hash_digestsize;
+ void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js));
+ unsigned js_offset = offset_in_page(js);
if (unlikely(digest_size > ic->tag_size)) {
char checksums_onstack[HASH_MAX_DIGESTSIZE];
- integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
+ integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, checksums_onstack);
memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
} else
- integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
+ integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, journal_entry_tag(ic, je));
}
journal_entry_set_sector(je, logical_sector);
@@ -2428,7 +2526,7 @@ retry:
if (!dio->integrity_payload) {
unsigned digest_size, extra_size;
dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block);
- digest_size = crypto_shash_digestsize(ic->internal_hash);
+ digest_size = ic->internal_hash_digestsize;
extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
dio->payload_len += extra_size;
dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -2505,11 +2603,11 @@ skip_spinlock:
unsigned pos = 0;
while (dio->bio_details.bi_iter.bi_size) {
struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
- const char *mem = bvec_kmap_local(&bv);
+ const char *mem = integrity_kmap(ic, bv.bv_page);
if (ic->tag_size < ic->tuple_size)
memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tuple_size);
- integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos);
- kunmap_local(mem);
+ integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, dio->integrity_payload + pos);
+ integrity_kunmap(ic, mem);
pos += ic->tuple_size;
bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
}
@@ -2558,14 +2656,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
char *mem;
outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
-
- r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
- if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
- bio_put(outgoing_bio);
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return;
- }
+ bio_add_virt_nofail(outgoing_bio, outgoing_data,
+ ic->sectors_per_block << SECTOR_SHIFT);
bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
if (IS_ERR(bip)) {
@@ -2594,8 +2686,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
}
bio_put(outgoing_bio);
- integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
+ integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, integrity_identity(ic, outgoing_data), 0, digest);
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(ic->internal_hash_digestsize, ic->tag_size)))) {
DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
atomic64_inc(&ic->number_of_mismatches);
@@ -2618,33 +2710,58 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
bio_endio(bio);
}
+static inline bool dm_integrity_check(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
+{
+ struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+ unsigned pos = 0;
+
+ while (dio->bio_details.bi_iter.bi_size) {
+ char digest[HASH_MAX_DIGESTSIZE];
+ struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
+ char *mem = integrity_kmap(ic, bv.bv_page);
+ integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, digest);
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos,
+ min(ic->internal_hash_digestsize, ic->tag_size)))) {
+ integrity_kunmap(ic, mem);
+ dm_integrity_free_payload(dio);
+ INIT_WORK(&dio->work, dm_integrity_inline_recheck);
+ queue_work(ic->offload_wq, &dio->work);
+ return false;
+ }
+ integrity_kunmap(ic, mem);
+ pos += ic->tuple_size;
+ bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
+ }
+
+ return true;
+}
+
+static void dm_integrity_inline_async_check(struct work_struct *w)
+{
+ struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
+ struct dm_integrity_c *ic = dio->ic;
+ struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+
+ if (likely(dm_integrity_check(ic, dio)))
+ bio_endio(bio);
+}
+
static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
struct dm_integrity_c *ic = ti->private;
+ struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
if (ic->mode == 'I') {
- struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
- if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) {
- unsigned pos = 0;
+ if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK) && likely(dio->bio_details.bi_iter.bi_size != 0)) {
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
unlikely(dio->integrity_range_locked))
- goto skip_check;
- while (dio->bio_details.bi_iter.bi_size) {
- char digest[HASH_MAX_DIGESTSIZE];
- struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
- char *mem = bvec_kmap_local(&bv);
- //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
- integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload + pos,
- min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
- kunmap_local(mem);
- dm_integrity_free_payload(dio);
- INIT_WORK(&dio->work, dm_integrity_inline_recheck);
- queue_work(ic->offload_wq, &dio->work);
+ goto skip_check;
+ if (likely(ic->internal_shash != NULL)) {
+ if (unlikely(!dm_integrity_check(ic, dio)))
return DM_ENDIO_INCOMPLETE;
- }
- kunmap_local(mem);
- pos += ic->tuple_size;
- bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
+ } else {
+ INIT_WORK(&dio->work, dm_integrity_inline_async_check);
+ queue_work(ic->offload_wq, &dio->work);
+ return DM_ENDIO_INCOMPLETE;
}
}
skip_check:
@@ -2652,6 +2769,8 @@ skip_check:
if (unlikely(dio->integrity_range_locked))
remove_range(ic, &dio->range);
}
+ if (unlikely(dio->ahash_req))
+ mempool_free(dio->ahash_req, &ic->ahash_req_pool);
return DM_ENDIO_DONE;
}
@@ -2708,7 +2827,7 @@ static void integrity_commit(struct work_struct *w)
unsigned int i, j, n;
struct bio *flushes;
- del_timer(&ic->autocommit_timer);
+ timer_delete(&ic->autocommit_timer);
if (ic->mode == 'I')
return;
@@ -2908,10 +3027,13 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
#endif
ic->internal_hash) {
char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+ struct journal_sector *js = access_journal_data(ic, i, l);
+ void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js));
+ unsigned js_offset = offset_in_page(js);
- integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
- (char *)access_journal_data(ic, i, l), test_tag);
- if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
+ integrity_sector_checksum(ic, &ic->journal_ahash_req, sec + ((l - j) << ic->sb->log2_sectors_per_block),
+ js_page, js_offset, test_tag);
+ if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
}
@@ -2993,6 +3115,7 @@ static void integrity_recalc(struct work_struct *w)
size_t recalc_tags_size;
u8 *recalc_buffer = NULL;
u8 *recalc_tags = NULL;
+ struct ahash_request *ahash_req = NULL;
struct dm_integrity_range range;
struct dm_io_request io_req;
struct dm_io_region io_loc;
@@ -3007,7 +3130,7 @@ static void integrity_recalc(struct work_struct *w)
unsigned recalc_sectors = RECALC_SECTORS;
retry:
- recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO);
+ recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN);
if (!recalc_buffer) {
oom:
recalc_sectors >>= 1;
@@ -3017,11 +3140,11 @@ oom:
goto free_ret;
}
recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
- if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
- recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
+ if (ic->internal_hash_digestsize > ic->tag_size)
+ recalc_tags_size += ic->internal_hash_digestsize - ic->tag_size;
recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO);
if (!recalc_tags) {
- vfree(recalc_buffer);
+ kfree(recalc_buffer);
recalc_buffer = NULL;
goto oom;
}
@@ -3087,7 +3210,7 @@ next_chunk:
goto err;
io_req.bi_opf = REQ_OP_READ;
- io_req.mem.type = DM_IO_VMA;
+ io_req.mem.type = DM_IO_KMEM;
io_req.mem.ptr.addr = recalc_buffer;
io_req.notify.fn = NULL;
io_req.client = ic->io;
@@ -3103,7 +3226,10 @@ next_chunk:
t = recalc_tags;
for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
- integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
+ void *ptr = recalc_buffer + (i << SECTOR_SHIFT);
+ void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr));
+ unsigned ptr_offset = offset_in_page(ptr);
+ integrity_sector_checksum(ic, &ahash_req, logical_sector + i, ptr_page, ptr_offset, t);
t += ic->tag_size;
}
@@ -3145,8 +3271,9 @@ unlock_ret:
recalc_write_super(ic);
free_ret:
- vfree(recalc_buffer);
+ kfree(recalc_buffer);
kvfree(recalc_tags);
+ mempool_free(ahash_req, &ic->ahash_req_pool);
}
static void integrity_recalc_inline(struct work_struct *w)
@@ -3155,6 +3282,7 @@ static void integrity_recalc_inline(struct work_struct *w)
size_t recalc_tags_size;
u8 *recalc_buffer = NULL;
u8 *recalc_tags = NULL;
+ struct ahash_request *ahash_req = NULL;
struct dm_integrity_range range;
struct bio *bio;
struct bio_integrity_payload *bip;
@@ -3177,8 +3305,8 @@ oom:
}
recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size;
- if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size)
- recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size;
+ if (ic->internal_hash_digestsize > ic->tuple_size)
+ recalc_tags_size += ic->internal_hash_digestsize - ic->tuple_size;
recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN);
if (!recalc_tags) {
kfree(recalc_buffer);
@@ -3212,7 +3340,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
r = submit_bio_wait(bio);
bio_put(bio);
if (unlikely(r)) {
@@ -3222,14 +3351,18 @@ next_chunk:
t = recalc_tags;
for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
+ void *ptr = recalc_buffer + (i << SECTOR_SHIFT);
+ void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr));
+ unsigned ptr_offset = offset_in_page(ptr);
memset(t, 0, ic->tuple_size);
- integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
+ integrity_sector_checksum(ic, &ahash_req, range.logical_sector + i, ptr_page, ptr_offset, t);
t += ic->tuple_size;
}
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
if (unlikely(IS_ERR(bip))) {
@@ -3274,6 +3407,7 @@ unlock_ret:
free_ret:
kfree(recalc_buffer);
kfree(recalc_tags);
+ mempool_free(ahash_req, &ic->ahash_req_pool);
}
static void bitmap_block_work(struct work_struct *w)
@@ -3607,7 +3741,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
- del_timer_sync(&ic->autocommit_timer);
+ timer_delete_sync(&ic->autocommit_timer);
if (ic->recalc_wq)
drain_workqueue(ic->recalc_wq);
@@ -3790,20 +3924,18 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE: {
- __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
-
- watermark_percentage += ic->journal_entries / 2;
- do_div(watermark_percentage, ic->journal_entries);
- arg_count = 3;
+ arg_count = 1; /* buffer_sectors */
arg_count += !!ic->meta_dev;
arg_count += ic->sectors_per_block != 1;
arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
arg_count += ic->reset_recalculate_flag;
arg_count += ic->discard;
- arg_count += ic->mode == 'J';
- arg_count += ic->mode == 'J';
- arg_count += ic->mode == 'B';
- arg_count += ic->mode == 'B';
+ arg_count += ic->mode != 'I'; /* interleave_sectors */
+ arg_count += ic->mode == 'J'; /* journal_sectors */
+ arg_count += ic->mode == 'J'; /* journal_watermark */
+ arg_count += ic->mode == 'J'; /* commit_time */
+ arg_count += ic->mode == 'B'; /* sectors_per_bit */
+ arg_count += ic->mode == 'B'; /* bitmap_flush_interval */
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
@@ -3822,10 +3954,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" reset_recalculate");
if (ic->discard)
DMEMIT(" allow_discards");
- DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
- DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
+ if (ic->mode != 'I')
+ DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
if (ic->mode == 'J') {
+ __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
+
+ watermark_percentage += ic->journal_entries / 2;
+ do_div(watermark_percentage, ic->journal_entries);
+ DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage);
DMEMIT(" commit_time:%u", ic->autocommit_msec);
}
@@ -3907,8 +4044,8 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim
struct blk_integrity *bi = &limits->integrity;
memset(bi, 0, sizeof(*bi));
- bi->tuple_size = ic->tag_size;
- bi->tag_size = bi->tuple_size;
+ bi->metadata_size = ic->tag_size;
+ bi->tag_size = bi->metadata_size;
bi->interval_exp =
ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
}
@@ -4211,30 +4348,53 @@ nomem:
return -ENOMEM;
}
-static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
- char *error_alg, char *error_key)
+static int get_mac(struct crypto_shash **shash, struct crypto_ahash **ahash,
+ struct alg_spec *a, char **error, char *error_alg, char *error_key)
{
int r;
if (a->alg_string) {
- *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
- if (IS_ERR(*hash)) {
- *error = error_alg;
- r = PTR_ERR(*hash);
- *hash = NULL;
- return r;
- }
-
- if (a->key) {
- r = crypto_shash_setkey(*hash, a->key, a->key_size);
- if (r) {
+ if (shash) {
+ *shash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
+ if (IS_ERR(*shash)) {
+ *shash = NULL;
+ goto try_ahash;
+ }
+ if (a->key) {
+ r = crypto_shash_setkey(*shash, a->key, a->key_size);
+ if (r) {
+ *error = error_key;
+ return r;
+ }
+ } else if (crypto_shash_get_flags(*shash) & CRYPTO_TFM_NEED_KEY) {
*error = error_key;
+ return -ENOKEY;
+ }
+ return 0;
+ }
+try_ahash:
+ if (ahash) {
+ *ahash = crypto_alloc_ahash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
+ if (IS_ERR(*ahash)) {
+ *error = error_alg;
+ r = PTR_ERR(*ahash);
+ *ahash = NULL;
return r;
}
- } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
- *error = error_key;
- return -ENOKEY;
+ if (a->key) {
+ r = crypto_ahash_setkey(*ahash, a->key, a->key_size);
+ if (r) {
+ *error = error_key;
+ return r;
+ }
+ } else if (crypto_ahash_get_flags(*ahash) & CRYPTO_TFM_NEED_KEY) {
+ *error = error_key;
+ return -ENOKEY;
+ }
+ return 0;
}
+ *error = error_alg;
+ return -ENOENT;
}
return 0;
@@ -4691,12 +4851,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
buffer_sectors = 1;
ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT);
- r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
+ r = get_mac(&ic->internal_shash, &ic->internal_ahash, &ic->internal_hash_alg, &ti->error,
"Invalid internal hash", "Error setting internal hash key");
if (r)
goto bad;
+ if (ic->internal_shash) {
+ ic->internal_hash = true;
+ ic->internal_hash_digestsize = crypto_shash_digestsize(ic->internal_shash);
+ }
+ if (ic->internal_ahash) {
+ ic->internal_hash = true;
+ ic->internal_hash_digestsize = crypto_ahash_digestsize(ic->internal_ahash);
+ r = mempool_init_kmalloc_pool(&ic->ahash_req_pool, AHASH_MEMPOOL,
+ sizeof(struct ahash_request) + crypto_ahash_reqsize(ic->internal_ahash));
+ if (r) {
+ ti->error = "Cannot allocate mempool";
+ goto bad;
+ }
+ }
- r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
+ r = get_mac(&ic->journal_mac, NULL, &ic->journal_mac_alg, &ti->error,
"Invalid journal mac", "Error setting journal mac key");
if (r)
goto bad;
@@ -4707,7 +4881,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
r = -EINVAL;
goto bad;
}
- ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
+ ic->tag_size = ic->internal_hash_digestsize;
}
if (ic->tag_size > MAX_TAG_SIZE) {
ti->error = "Too big tag size";
@@ -4747,18 +4921,18 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
ti->error = "Integrity profile not supported";
goto bad;
}
- /*printk("tag_size: %u, tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/
- if (bi->tuple_size < ic->tag_size) {
+ /*printk("tag_size: %u, metadata_size: %u\n", bi->tag_size, bi->metadata_size);*/
+ if (bi->metadata_size < ic->tag_size) {
r = -EINVAL;
ti->error = "The integrity profile is smaller than tag size";
goto bad;
}
- if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) {
+ if ((unsigned long)bi->metadata_size > PAGE_SIZE / 2) {
r = -EINVAL;
ti->error = "Too big tuple size";
goto bad;
}
- ic->tuple_size = bi->tuple_size;
+ ic->tuple_size = bi->metadata_size;
if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) {
r = -EINVAL;
ti->error = "Integrity profile sector size mismatch";
@@ -4808,23 +4982,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
ti->error = "Cannot allocate bio set";
goto bad;
}
- r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE);
- if (r) {
- ti->error = "Cannot allocate bio integrity set";
- r = -ENOMEM;
- goto bad;
- }
r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS);
if (r) {
ti->error = "Cannot allocate bio set";
goto bad;
}
- r = bioset_integrity_create(&ic->recalc_bios, 1);
- if (r) {
- ti->error = "Cannot allocate bio integrity set";
- r = -ENOMEM;
- goto bad;
- }
}
ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
@@ -5081,16 +5243,19 @@ try_smaller_buffer:
ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->recalc_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->may_write_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
if (!ic->bbs) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
@@ -5171,7 +5336,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
BUG_ON(!list_empty(&ic->wait_list));
- if (ic->mode == 'B')
+ if (ic->mode == 'B' && ic->bitmap_flush_work.work.func)
cancel_delayed_work_sync(&ic->bitmap_flush_work);
if (ic->metadata_wq)
destroy_workqueue(ic->metadata_wq);
@@ -5188,6 +5353,8 @@ static void dm_integrity_dtr(struct dm_target *ti)
kvfree(ic->bbs);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
+ mempool_free(ic->journal_ahash_req, &ic->ahash_req_pool);
+ mempool_exit(&ic->ahash_req_pool);
bioset_exit(&ic->recalc_bios);
bioset_exit(&ic->recheck_bios);
mempool_exit(&ic->recheck_pool);
@@ -5225,8 +5392,10 @@ static void dm_integrity_dtr(struct dm_target *ti)
if (ic->sb)
free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
- if (ic->internal_hash)
- crypto_free_shash(ic->internal_hash);
+ if (ic->internal_shash)
+ crypto_free_shash(ic->internal_shash);
+ if (ic->internal_ahash)
+ crypto_free_ahash(ic->internal_ahash);
free_alg(&ic->internal_hash_alg);
if (ic->journal_crypt)
@@ -5243,7 +5412,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index d7a8e2f40db3..c37668790577 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -379,6 +379,7 @@ static void do_region(const blk_opf_t opf, unsigned int region,
atomic_inc(&io->count);
submit_bio(bio);
+ WARN_ON_ONCE(opf & REQ_ATOMIC && remaining);
} while (remaining);
}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index d42eac944eb5..4165fef4c170 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1885,6 +1885,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
{DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
{DM_GET_TARGET_VERSION_CMD, 0, get_target_version},
+ {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */
};
if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 49fb0f684193..73bf290af181 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -119,7 +119,9 @@ static void linear_status(struct dm_target *ti, status_type_t type,
}
}
-static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct linear_c *lc = ti->private;
struct dm_dev *dev = lc->dev;
@@ -168,7 +170,7 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode, void **kaddr,
- pfn_t *pfn)
+ unsigned long *pfn)
{
struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
@@ -199,9 +201,10 @@ static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
static struct target_type linear_target = {
.name = "linear",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
- DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO,
+ DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO |
+ DM_TARGET_ATOMIC_WRITES,
.report_zones = linear_report_zones,
.module = THIS_MODULE,
.ctr = linear_ctr,
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 8d7df8303d0a..7bb7174f8f4f 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -414,7 +414,7 @@ static int log_super(struct log_writes_c *lc)
}
/*
- * Super sector should be writen in-order, otherwise the
+ * Super sector should be written in-order, otherwise the
* nr_entries could be rewritten incorrectly by an old bio.
*/
wait_for_completion_io(&lc->super_done);
@@ -818,7 +818,9 @@ static void log_writes_status(struct dm_target *ti, status_type_t type,
}
static int log_writes_prepare_ioctl(struct dm_target *ti,
- struct block_device **bdev)
+ struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct log_writes_c *lc = ti->private;
struct dm_dev *dev = lc->dev;
@@ -891,7 +893,7 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti,
static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode, void **kaddr,
- pfn_t *pfn)
+ unsigned long *pfn)
{
struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 637977acc3dc..aaf4a0a4b0eb 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -79,6 +79,7 @@ struct multipath {
struct pgpath *current_pgpath;
struct priority_group *current_pg;
struct priority_group *next_pg; /* Switch to this PG if set */
+ struct priority_group *last_probed_pg;
atomic_t nr_valid_paths; /* Total number of usable paths */
unsigned int nr_priority_groups;
@@ -87,6 +88,7 @@ struct multipath {
const char *hw_handler_name;
char *hw_handler_params;
wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+ wait_queue_head_t probe_wait; /* Wait for probing paths */
unsigned int pg_init_retries; /* Number of times to retry pg_init */
unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */
atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
@@ -100,6 +102,7 @@ struct multipath {
struct bio_list queued_bios;
struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
+ bool is_suspending;
};
/*
@@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t);
#define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */
#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */
+#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */
+#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */
static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
{
@@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
atomic_set(&m->pg_init_count, 0);
m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
init_waitqueue_head(&m->pg_init_wait);
+ init_waitqueue_head(&m->probe_wait);
return 0;
}
@@ -413,13 +419,21 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
goto failed;
}
+ /* Don't change PG until it has no remaining paths */
+ pg = READ_ONCE(m->current_pg);
+ if (pg) {
+ pgpath = choose_path_in_pg(m, pg, nr_bytes);
+ if (!IS_ERR_OR_NULL(pgpath))
+ return pgpath;
+ }
+
/* Were we instructed to switch PG? */
if (READ_ONCE(m->next_pg)) {
spin_lock_irqsave(&m->lock, flags);
pg = m->next_pg;
if (!pg) {
spin_unlock_irqrestore(&m->lock, flags);
- goto check_current_pg;
+ goto check_all_pgs;
}
m->next_pg = NULL;
spin_unlock_irqrestore(&m->lock, flags);
@@ -427,16 +441,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
if (!IS_ERR_OR_NULL(pgpath))
return pgpath;
}
-
- /* Don't change PG until it has no remaining paths */
-check_current_pg:
- pg = READ_ONCE(m->current_pg);
- if (pg) {
- pgpath = choose_path_in_pg(m, pg, nr_bytes);
- if (!IS_ERR_OR_NULL(pgpath))
- return pgpath;
- }
-
+check_all_pgs:
/*
* Loop through priority groups until we find a valid path.
* First time we skip PGs marked 'bypassed'.
@@ -612,7 +617,6 @@ static void multipath_queue_bio(struct multipath *m, struct bio *bio)
static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
{
struct pgpath *pgpath;
- unsigned long flags;
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
@@ -620,12 +624,12 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
if (!pgpath) {
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
__multipath_queue_bio(m, bio);
pgpath = ERR_PTR(-EAGAIN);
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
} else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
@@ -688,7 +692,6 @@ static void process_queued_io_list(struct multipath *m)
static void process_queued_bios(struct work_struct *work)
{
int r;
- unsigned long flags;
struct bio *bio;
struct bio_list bios;
struct blk_plug plug;
@@ -697,16 +700,16 @@ static void process_queued_bios(struct work_struct *work)
bio_list_init(&bios);
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (bio_list_empty(&m->queued_bios)) {
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
return;
}
bio_list_merge_init(&bios, &m->queued_bios);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios))) {
@@ -787,7 +790,7 @@ static int queue_if_no_path(struct multipath *m, bool f_queue_if_no_path,
*/
static void queue_if_no_path_timeout_work(struct timer_list *t)
{
- struct multipath *m = from_timer(m, t, nopath_timer);
+ struct multipath *m = timer_container_of(m, t, nopath_timer);
DMWARN("queue_if_no_path timeout on %s, failing queued IO",
dm_table_device_name(m->ti->table));
@@ -815,7 +818,7 @@ static void enable_nopath_timeout(struct multipath *m)
static void disable_nopath_timeout(struct multipath *m)
{
- del_timer_sync(&m->nopath_timer);
+ timer_delete_sync(&m->nopath_timer);
}
/*
@@ -1190,7 +1193,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv)
struct dm_arg_set as;
unsigned int pg_count = 0;
unsigned int next_pg_num;
- unsigned long flags;
as.argc = argc;
as.argv = argv;
@@ -1255,9 +1257,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
enable_nopath_timeout(m);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
@@ -1292,23 +1294,21 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
static void flush_multipath_work(struct multipath *m)
{
if (m->hw_handler_name) {
- unsigned long flags;
-
if (!atomic_read(&m->pg_init_in_progress))
goto skip;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (atomic_read(&m->pg_init_in_progress) &&
!test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) {
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
skip:
if (m->queue_mode == DM_TYPE_BIO_BASED)
@@ -1370,11 +1370,10 @@ out:
static int reinstate_path(struct pgpath *pgpath)
{
int r = 0, run_queue = 0;
- unsigned long flags;
struct multipath *m = pgpath->pg->m;
unsigned int nr_valid_paths;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (pgpath->is_active)
goto out;
@@ -1404,7 +1403,7 @@ static int reinstate_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
out:
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
if (run_queue) {
dm_table_run_md_queue_async(m->ti->table);
process_queued_io_list(m);
@@ -1439,15 +1438,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action)
* Temporarily try to avoid having to use the specified PG
*/
static void bypass_pg(struct multipath *m, struct priority_group *pg,
- bool bypassed)
+ bool bypassed, bool can_be_delayed)
{
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
pg->bypassed = bypassed;
- m->current_pgpath = NULL;
- m->current_pg = NULL;
+ if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags))
+ set_bit(MPATHF_NEED_PG_SWITCH, &m->flags);
+ else {
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
spin_unlock_irqrestore(&m->lock, flags);
@@ -1461,7 +1464,6 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
{
struct priority_group *pg;
unsigned int pgnum;
- unsigned long flags;
char dummy;
if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
@@ -1470,17 +1472,21 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
return -EINVAL;
}
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
list_for_each_entry(pg, &m->priority_groups, list) {
pg->bypassed = false;
if (--pgnum)
continue;
- m->current_pgpath = NULL;
- m->current_pg = NULL;
+ if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags))
+ set_bit(MPATHF_NEED_PG_SWITCH, &m->flags);
+ else {
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
m->next_pg = pg;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
schedule_work(&m->trigger_event);
return 0;
@@ -1507,7 +1513,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
break;
}
- bypass_pg(m, pg, bypassed);
+ bypass_pg(m, pg, bypassed, true);
return 0;
}
@@ -1561,7 +1567,7 @@ static void pg_init_done(void *data, int errors)
* Probably doing something like FW upgrade on the
* controller so try the other pg.
*/
- bypass_pg(m, pg, true);
+ bypass_pg(m, pg, true, false);
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
@@ -1742,6 +1748,9 @@ static void multipath_presuspend(struct dm_target *ti)
{
struct multipath *m = ti->private;
+ spin_lock_irq(&m->lock);
+ m->is_suspending = true;
+ spin_unlock_irq(&m->lock);
/* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
queue_if_no_path(m, false, true, __func__);
@@ -1762,9 +1771,9 @@ static void multipath_postsuspend(struct dm_target *ti)
static void multipath_resume(struct dm_target *ti)
{
struct multipath *m = ti->private;
- unsigned long flags;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
+ m->is_suspending = false;
if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
@@ -1775,7 +1784,7 @@ static void multipath_resume(struct dm_target *ti)
test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
/*
@@ -1798,14 +1807,13 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
unsigned int status_flags, char *result, unsigned int maxlen)
{
int sz = 0, pg_counter, pgpath_counter;
- unsigned long flags;
struct multipath *m = ti->private;
struct priority_group *pg;
struct pgpath *p;
unsigned int pg_num;
char state;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
/* Features */
if (type == STATUSTYPE_INFO)
@@ -1845,10 +1853,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
DMEMIT("%u ", m->nr_priority_groups);
- if (m->next_pg)
- pg_num = m->next_pg->pg_num;
- else if (m->current_pg)
+ if (m->current_pg)
pg_num = m->current_pg->pg_num;
+ else if (m->next_pg)
+ pg_num = m->next_pg->pg_num;
else
pg_num = (m->nr_priority_groups ? 1 : 0);
@@ -1951,7 +1959,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
break;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv,
@@ -1961,7 +1969,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg
dev_t dev;
struct multipath *m = ti->private;
action_fn action;
- unsigned long flags;
mutex_lock(&m->work_mutex);
@@ -1973,9 +1980,9 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, true, false, __func__);
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
enable_nopath_timeout(m);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, false, false, __func__);
@@ -2021,14 +2028,132 @@ out:
return r;
}
+/*
+ * Perform a minimal read from the given path to find out whether the
+ * path still works. If a path error occurs, fail it.
+ */
+static int probe_path(struct pgpath *pgpath)
+{
+ struct block_device *bdev = pgpath->path.dev->bdev;
+ unsigned int read_size = bdev_logical_block_size(bdev);
+ struct page *page;
+ struct bio *bio;
+ blk_status_t status;
+ int r = 0;
+
+ if (WARN_ON_ONCE(read_size > PAGE_SIZE))
+ return -EINVAL;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ /* Perform a minimal read: Sector 0, length read_size */
+ bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ if (!bio) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ bio->bi_iter.bi_sector = 0;
+ __bio_add_page(bio, page, read_size, 0);
+ submit_bio_wait(bio);
+ status = bio->bi_status;
+ bio_put(bio);
+
+ if (status && blk_path_error(status))
+ fail_path(pgpath);
+
+out:
+ __free_page(page);
+ return r;
+}
+
+/*
+ * Probe all active paths in current_pg to find out whether they still work.
+ * Fail all paths that do not work.
+ *
+ * Return -ENOTCONN if no valid path is left (even outside of current_pg). We
+ * cannot probe paths in other pgs without switching current_pg, so if valid
+ * paths are only in different pgs, they may or may not work. Additionally,
+ * we should not probe paths in a pathgroup that is in the process of
+ * initializing. Userspace can submit a request; we will then switch to that
+ * pathgroup and wait for it to initialize. If the request fails, userspace
+ * may need to probe again.
+ */
+static int probe_active_paths(struct multipath *m)
+{
+ struct pgpath *pgpath;
+ struct priority_group *pg = NULL;
+ int r = 0;
+
+ spin_lock_irq(&m->lock);
+ if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) {
+ wait_event_lock_irq(m->probe_wait,
+ !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags),
+ m->lock);
+ /*
+ * If we waited because a probe was already in progress
+ * and it probed the currently active pathgroup, don't
+ * reprobe; just report whether any valid paths remain.
+ */
+ if (m->current_pg == m->last_probed_pg)
+ goto skip_probe;
+ }
+ if (!m->current_pg || m->is_suspending ||
+ test_bit(MPATHF_QUEUE_IO, &m->flags))
+ goto skip_probe;
+ set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags);
+ pg = m->last_probed_pg = m->current_pg;
+ spin_unlock_irq(&m->lock);
+
+ list_for_each_entry(pgpath, &pg->pgpaths, list) {
+ if (pg != READ_ONCE(m->current_pg) ||
+ READ_ONCE(m->is_suspending))
+ goto out;
+ if (!pgpath->is_active)
+ continue;
+
+ r = probe_path(pgpath);
+ if (r < 0)
+ goto out;
+ }
+
+out:
+ spin_lock_irq(&m->lock);
+ clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags);
+ if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) {
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
+skip_probe:
+ if (r == 0 && !atomic_read(&m->nr_valid_paths))
+ r = -ENOTCONN;
+ spin_unlock_irq(&m->lock);
+ if (pg)
+ wake_up(&m->probe_wait);
+ return r;
+}
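The DM_MPATH_PROBE_PATHS command handled in multipath_prepare_ioctl() below is issued directly against the mapped block device rather than through the usual target message interface. A minimal userspace sketch, assuming the ioctl number is exported through <linux/dm-ioctl.h> (the uapi change is not part of this hunk) and using an example device node name:

/* Sketch only: probe the active pathgroup of a dm-multipath device. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	int fd = open("/dev/mapper/mpatha", O_RDONLY);

	if (fd < 0)
		return 1;
	/* 0: at least one valid path remains; -1 with errno == ENOTCONN: none left. */
	if (ioctl(fd, DM_MPATH_PROBE_PATHS) < 0)
		perror("DM_MPATH_PROBE_PATHS");
	close(fd);
	return 0;
}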
+
static int multipath_prepare_ioctl(struct dm_target *ti,
- struct block_device **bdev)
+ struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct multipath *m = ti->private;
struct pgpath *pgpath;
- unsigned long flags;
int r;
+ if (_IOC_TYPE(cmd) == DM_IOCTL) {
+ *forward = false;
+ switch (cmd) {
+ case DM_MPATH_PROBE_PATHS:
+ return probe_active_paths(m);
+ default:
+ return -ENOTTY;
+ }
+ }
+
pgpath = READ_ONCE(m->current_pgpath);
if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
pgpath = choose_pgpath(m, 0);
@@ -2044,10 +2169,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
} else {
/* No path is available */
r = -EIO;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
r = -ENOTCONN;
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
if (r == -ENOTCONN) {
@@ -2055,10 +2180,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
/* Path status changed, redo selection */
(void) choose_pgpath(m, 0);
}
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
(void) __pg_init_all_paths(m);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
dm_table_run_md_queue_async(m->ti->table);
process_queued_io_list(m);
}
@@ -2180,7 +2305,7 @@ static int multipath_busy(struct dm_target *ti)
*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 14, 0},
+ .version = {1, 15, 0},
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index 3e4cb81ce512..d0b883fabfeb 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -117,16 +117,16 @@ int dm_register_path_selector(struct path_selector_type *pst)
}
EXPORT_SYMBOL_GPL(dm_register_path_selector);
-int dm_unregister_path_selector(struct path_selector_type *pst)
+void dm_unregister_path_selector(struct path_selector_type *pst)
{
struct ps_internal *psi;
down_write(&_ps_lock);
psi = __find_path_selector_type(pst->name);
- if (!psi) {
+ if (WARN_ON(!psi)) {
up_write(&_ps_lock);
- return -EINVAL;
+ return;
}
list_del(&psi->list);
@@ -134,7 +134,5 @@ int dm_unregister_path_selector(struct path_selector_type *pst)
up_write(&_ps_lock);
kfree(psi);
-
- return 0;
}
EXPORT_SYMBOL_GPL(dm_unregister_path_selector);
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 3861b2d8b963..7b2270532e64 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -96,7 +96,7 @@ struct path_selector_type {
int dm_register_path_selector(struct path_selector_type *type);
/* Unregister a path selector */
-int dm_unregister_path_selector(struct path_selector_type *type);
+void dm_unregister_path_selector(struct path_selector_type *type);
/* Returns a registered path selector type */
struct path_selector_type *dm_get_path_selector(const char *name);
diff --git a/drivers/md/dm-pcache/Kconfig b/drivers/md/dm-pcache/Kconfig
new file mode 100644
index 000000000000..0e251eca892e
--- /dev/null
+++ b/drivers/md/dm-pcache/Kconfig
@@ -0,0 +1,17 @@
+config DM_PCACHE
+ tristate "Persistent cache for Block Device (Experimental)"
+ depends on BLK_DEV_DM
+ depends on DEV_DAX
+ help
+ PCACHE provides a mechanism to use persistent memory (e.g., CXL persistent memory,
+ DAX-enabled devices) as a high-performance cache layer in front of
+ traditional block devices such as SSDs or HDDs.
+
+ PCACHE is implemented as a kernel module that integrates with the block
+ layer and supports direct access (DAX) to persistent memory for low-latency,
+ byte-addressable caching.
+
+ Note: This feature is experimental and should be tested thoroughly
+ before use in production environments.
+
+ If unsure, say 'N'.
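Given the dependencies above, a minimal config fragment for building the target as a module could look like the following (assuming device-mapper and DAX support are otherwise configured as usual):

CONFIG_BLK_DEV_DM=y
CONFIG_DEV_DAX=y
CONFIG_DM_PCACHE=m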
diff --git a/drivers/md/dm-pcache/Makefile b/drivers/md/dm-pcache/Makefile
new file mode 100644
index 000000000000..cedfd38854f6
--- /dev/null
+++ b/drivers/md/dm-pcache/Makefile
@@ -0,0 +1,3 @@
+dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o
+
+obj-$(CONFIG_DM_PCACHE) += dm-pcache.o
diff --git a/drivers/md/dm-pcache/backing_dev.c b/drivers/md/dm-pcache/backing_dev.c
new file mode 100644
index 000000000000..7165fc0364bb
--- /dev/null
+++ b/drivers/md/dm-pcache/backing_dev.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/blkdev.h>
+
+#include "../dm-core.h"
+#include "pcache_internal.h"
+#include "cache_dev.h"
+#include "backing_dev.h"
+#include "cache.h"
+#include "dm_pcache.h"
+
+static struct kmem_cache *backing_req_cache;
+static struct kmem_cache *backing_bvec_cache;
+
+static void backing_dev_exit(struct pcache_backing_dev *backing_dev)
+{
+ mempool_exit(&backing_dev->req_pool);
+ mempool_exit(&backing_dev->bvec_pool);
+}
+
+static void req_submit_fn(struct work_struct *work);
+static void req_complete_fn(struct work_struct *work);
+static int backing_dev_init(struct dm_pcache *pcache)
+{
+ struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
+ int ret;
+
+ ret = mempool_init_slab_pool(&backing_dev->req_pool, 128, backing_req_cache);
+ if (ret)
+ goto err;
+
+ ret = mempool_init_slab_pool(&backing_dev->bvec_pool, 128, backing_bvec_cache);
+ if (ret)
+ goto req_pool_exit;
+
+ INIT_LIST_HEAD(&backing_dev->submit_list);
+ INIT_LIST_HEAD(&backing_dev->complete_list);
+ spin_lock_init(&backing_dev->submit_lock);
+ spin_lock_init(&backing_dev->complete_lock);
+ INIT_WORK(&backing_dev->req_submit_work, req_submit_fn);
+ INIT_WORK(&backing_dev->req_complete_work, req_complete_fn);
+ atomic_set(&backing_dev->inflight_reqs, 0);
+ init_waitqueue_head(&backing_dev->inflight_wq);
+
+ return 0;
+
+req_pool_exit:
+ mempool_exit(&backing_dev->req_pool);
+err:
+ return ret;
+}
+
+int backing_dev_start(struct dm_pcache *pcache)
+{
+ struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
+ int ret;
+
+ ret = backing_dev_init(pcache);
+ if (ret)
+ return ret;
+
+ backing_dev->dev_size = bdev_nr_sectors(backing_dev->dm_dev->bdev);
+
+ return 0;
+}
+
+void backing_dev_stop(struct dm_pcache *pcache)
+{
+ struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
+
+ /*
+ * No new requests should be coming in at this point; just wait
+ * for the inflight requests to complete.
+ */
+ wait_event(backing_dev->inflight_wq,
+ atomic_read(&backing_dev->inflight_reqs) == 0);
+
+ flush_work(&backing_dev->req_submit_work);
+ flush_work(&backing_dev->req_complete_work);
+
+ backing_dev_exit(backing_dev);
+}
+
+/* pcache_backing_dev_req functions */
+void backing_dev_req_end(struct pcache_backing_dev_req *backing_req)
+{
+ struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
+
+ if (backing_req->end_req)
+ backing_req->end_req(backing_req, backing_req->ret);
+
+ switch (backing_req->type) {
+ case BACKING_DEV_REQ_TYPE_REQ:
+ if (backing_req->req.upper_req)
+ pcache_req_put(backing_req->req.upper_req, backing_req->ret);
+ break;
+ case BACKING_DEV_REQ_TYPE_KMEM:
+ if (backing_req->kmem.bvecs != backing_req->kmem.inline_bvecs)
+ mempool_free(backing_req->kmem.bvecs, &backing_dev->bvec_pool);
+ break;
+ default:
+ BUG();
+ }
+
+ mempool_free(backing_req, &backing_dev->req_pool);
+
+ if (atomic_dec_and_test(&backing_dev->inflight_reqs))
+ wake_up(&backing_dev->inflight_wq);
+}
+
+static void req_complete_fn(struct work_struct *work)
+{
+ struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_complete_work);
+ struct pcache_backing_dev_req *backing_req;
+ LIST_HEAD(tmp_list);
+
+ spin_lock_irq(&backing_dev->complete_lock);
+ list_splice_init(&backing_dev->complete_list, &tmp_list);
+ spin_unlock_irq(&backing_dev->complete_lock);
+
+ while (!list_empty(&tmp_list)) {
+ backing_req = list_first_entry(&tmp_list,
+ struct pcache_backing_dev_req, node);
+ list_del_init(&backing_req->node);
+ backing_dev_req_end(backing_req);
+ }
+}
+
+static void backing_dev_bio_end(struct bio *bio)
+{
+ struct pcache_backing_dev_req *backing_req = bio->bi_private;
+ struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
+ unsigned long flags;
+
+ backing_req->ret = blk_status_to_errno(bio->bi_status);
+
+ spin_lock_irqsave(&backing_dev->complete_lock, flags);
+ list_move_tail(&backing_req->node, &backing_dev->complete_list);
+ queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_complete_work);
+ spin_unlock_irqrestore(&backing_dev->complete_lock, flags);
+}
+
+static void req_submit_fn(struct work_struct *work)
+{
+ struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_submit_work);
+ struct pcache_backing_dev_req *backing_req;
+ LIST_HEAD(tmp_list);
+
+ spin_lock(&backing_dev->submit_lock);
+ list_splice_init(&backing_dev->submit_list, &tmp_list);
+ spin_unlock(&backing_dev->submit_lock);
+
+ while (!list_empty(&tmp_list)) {
+ backing_req = list_first_entry(&tmp_list,
+ struct pcache_backing_dev_req, node);
+ list_del_init(&backing_req->node);
+ submit_bio_noacct(&backing_req->bio);
+ }
+}
+
+void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct)
+{
+ struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
+
+ if (direct) {
+ submit_bio_noacct(&backing_req->bio);
+ return;
+ }
+
+ spin_lock(&backing_dev->submit_lock);
+ list_add_tail(&backing_req->node, &backing_dev->submit_list);
+ queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_submit_work);
+ spin_unlock(&backing_dev->submit_lock);
+}
+
+static void bio_map(struct bio *bio, void *base, size_t size)
+{
+ struct page *page;
+ unsigned int offset;
+ unsigned int len;
+
+ if (!is_vmalloc_addr(base)) {
+ page = virt_to_page(base);
+ offset = offset_in_page(base);
+
+ BUG_ON(!bio_add_page(bio, page, size, offset));
+ return;
+ }
+
+ flush_kernel_vmap_range(base, size);
+ while (size) {
+ page = vmalloc_to_page(base);
+ offset = offset_in_page(base);
+ len = min_t(size_t, PAGE_SIZE - offset, size);
+
+ BUG_ON(!bio_add_page(bio, page, len, offset));
+ size -= len;
+ base += len;
+ }
+}
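bio_map() above packs a kernel buffer into bio_vecs, walking one page at a time for vmalloc memory since it is only virtually contiguous. A minimal sketch of driving it for a vmalloc'd buffer (sizes and the bdev variable are illustrative; within this file the real caller is kmem_type_req_init()):

	/* Illustrative only: a 64 KiB vmalloc buffer becomes 16 single-page bio_vecs. */
	void *buf = vmalloc(SZ_64K);
	unsigned int nr_vecs = bio_add_max_vecs(buf, SZ_64K);
	struct bio *bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, GFP_KERNEL);

	bio->bi_iter.bi_sector = 0;
	bio_map(bio, buf, SZ_64K);
	submit_bio_wait(bio);
	bio_put(bio);
	vfree(buf);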
+
+static struct pcache_backing_dev_req *req_type_req_alloc(struct pcache_backing_dev *backing_dev,
+ struct pcache_backing_dev_req_opts *opts)
+{
+ struct pcache_request *pcache_req = opts->req.upper_req;
+ struct pcache_backing_dev_req *backing_req;
+ struct bio *orig = pcache_req->bio;
+
+ backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask);
+ if (!backing_req)
+ return NULL;
+
+ memset(backing_req, 0, sizeof(struct pcache_backing_dev_req));
+
+ bio_init_clone(backing_dev->dm_dev->bdev, &backing_req->bio, orig, opts->gfp_mask);
+
+ backing_req->type = BACKING_DEV_REQ_TYPE_REQ;
+ backing_req->backing_dev = backing_dev;
+ atomic_inc(&backing_dev->inflight_reqs);
+
+ return backing_req;
+}
+
+static struct pcache_backing_dev_req *kmem_type_req_alloc(struct pcache_backing_dev *backing_dev,
+ struct pcache_backing_dev_req_opts *opts)
+{
+ struct pcache_backing_dev_req *backing_req;
+ u32 n_vecs = bio_add_max_vecs(opts->kmem.data, opts->kmem.len);
+
+ backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask);
+ if (!backing_req)
+ return NULL;
+
+ memset(backing_req, 0, sizeof(struct pcache_backing_dev_req));
+
+ if (n_vecs > BACKING_DEV_REQ_INLINE_BVECS) {
+ backing_req->kmem.bvecs = mempool_alloc(&backing_dev->bvec_pool, opts->gfp_mask);
+ if (!backing_req->kmem.bvecs)
+ goto free_backing_req;
+ } else {
+ backing_req->kmem.bvecs = backing_req->kmem.inline_bvecs;
+ }
+
+ backing_req->kmem.n_vecs = n_vecs;
+ backing_req->type = BACKING_DEV_REQ_TYPE_KMEM;
+ backing_req->backing_dev = backing_dev;
+ atomic_inc(&backing_dev->inflight_reqs);
+
+ return backing_req;
+
+free_backing_req:
+ mempool_free(backing_req, &backing_dev->req_pool);
+ return NULL;
+}
+
+struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev,
+ struct pcache_backing_dev_req_opts *opts)
+{
+ if (opts->type == BACKING_DEV_REQ_TYPE_REQ)
+ return req_type_req_alloc(backing_dev, opts);
+
+ if (opts->type == BACKING_DEV_REQ_TYPE_KMEM)
+ return kmem_type_req_alloc(backing_dev, opts);
+
+ BUG();
+}
+
+static void req_type_req_init(struct pcache_backing_dev_req *backing_req,
+ struct pcache_backing_dev_req_opts *opts)
+{
+ struct pcache_request *pcache_req = opts->req.upper_req;
+ struct bio *clone;
+ u32 off = opts->req.req_off;
+ u32 len = opts->req.len;
+
+ clone = &backing_req->bio;
+ BUG_ON(off & SECTOR_MASK);
+ BUG_ON(len & SECTOR_MASK);
+ bio_trim(clone, off >> SECTOR_SHIFT, len >> SECTOR_SHIFT);
+
+ clone->bi_iter.bi_sector = (pcache_req->off + off) >> SECTOR_SHIFT;
+ clone->bi_private = backing_req;
+ clone->bi_end_io = backing_dev_bio_end;
+
+ INIT_LIST_HEAD(&backing_req->node);
+ backing_req->end_req = opts->end_fn;
+
+ pcache_req_get(pcache_req);
+ backing_req->req.upper_req = pcache_req;
+ backing_req->req.bio_off = off;
+}
+
+static void kmem_type_req_init(struct pcache_backing_dev_req *backing_req,
+ struct pcache_backing_dev_req_opts *opts)
+{
+ struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
+ struct bio *backing_bio;
+
+ bio_init(&backing_req->bio, backing_dev->dm_dev->bdev, backing_req->kmem.bvecs,
+ backing_req->kmem.n_vecs, opts->kmem.opf);
+
+ backing_bio = &backing_req->bio;
+ bio_map(backing_bio, opts->kmem.data, opts->kmem.len);
+
+ backing_bio->bi_iter.bi_sector = (opts->kmem.backing_off) >> SECTOR_SHIFT;
+ backing_bio->bi_private = backing_req;
+ backing_bio->bi_end_io = backing_dev_bio_end;
+
+ INIT_LIST_HEAD(&backing_req->node);
+ backing_req->end_req = opts->end_fn;
+ backing_req->priv_data = opts->priv_data;
+}
+
+void backing_dev_req_init(struct pcache_backing_dev_req *backing_req,
+ struct pcache_backing_dev_req_opts *opts)
+{
+ if (opts->type == BACKING_DEV_REQ_TYPE_REQ)
+ return req_type_req_init(backing_req, opts);
+
+ if (opts->type == BACKING_DEV_REQ_TYPE_KMEM)
+ return kmem_type_req_init(backing_req, opts);
+
+ BUG();
+}
+
+struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev,
+ struct pcache_backing_dev_req_opts *opts)
+{
+ struct pcache_backing_dev_req *backing_req;
+
+ backing_req = backing_dev_req_alloc(backing_dev, opts);
+ if (!backing_req)
+ return NULL;
+
+ backing_dev_req_init(backing_req, opts);
+
+ return backing_req;
+}
+
+void backing_dev_flush(struct pcache_backing_dev *backing_dev)
+{
+ blkdev_issue_flush(backing_dev->dm_dev->bdev);
+}
+
+int pcache_backing_init(void)
+{
+ u32 max_bvecs = (PCACHE_CACHE_SUBTREE_SIZE >> PAGE_SHIFT) + 1;
+ int ret;
+
+ backing_req_cache = KMEM_CACHE(pcache_backing_dev_req, 0);
+ if (!backing_req_cache) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ backing_bvec_cache = kmem_cache_create("pcache-bvec-slab",
+ max_bvecs * sizeof(struct bio_vec),
+ 0, 0, NULL);
+ if (!backing_bvec_cache) {
+ ret = -ENOMEM;
+ goto destroy_req_cache;
+ }
+
+ return 0;
+destroy_req_cache:
+ kmem_cache_destroy(backing_req_cache);
+err:
+ return ret;
+}
+
+void pcache_backing_exit(void)
+{
+ kmem_cache_destroy(backing_bvec_cache);
+ kmem_cache_destroy(backing_req_cache);
+}
diff --git a/drivers/md/dm-pcache/backing_dev.h b/drivers/md/dm-pcache/backing_dev.h
new file mode 100644
index 000000000000..b371cba483b9
--- /dev/null
+++ b/drivers/md/dm-pcache/backing_dev.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _BACKING_DEV_H
+#define _BACKING_DEV_H
+
+#include <linux/device-mapper.h>
+
+#include "pcache_internal.h"
+
+struct pcache_backing_dev_req;
+typedef void (*backing_req_end_fn_t)(struct pcache_backing_dev_req *backing_req, int ret);
+
+#define BACKING_DEV_REQ_TYPE_REQ 1
+#define BACKING_DEV_REQ_TYPE_KMEM 2
+
+#define BACKING_DEV_REQ_INLINE_BVECS 4
+
+struct pcache_request;
+struct pcache_backing_dev_req {
+ u8 type;
+ struct bio bio;
+ struct pcache_backing_dev *backing_dev;
+
+ void *priv_data;
+ backing_req_end_fn_t end_req;
+
+ struct list_head node;
+ int ret;
+
+ union {
+ struct {
+ struct pcache_request *upper_req;
+ u32 bio_off;
+ } req;
+ struct {
+ struct bio_vec inline_bvecs[BACKING_DEV_REQ_INLINE_BVECS];
+ struct bio_vec *bvecs;
+ u32 n_vecs;
+ } kmem;
+ };
+};
+
+struct pcache_backing_dev {
+ struct pcache_cache *cache;
+
+ struct dm_dev *dm_dev;
+ mempool_t req_pool;
+ mempool_t bvec_pool;
+
+ struct list_head submit_list;
+ spinlock_t submit_lock;
+ struct work_struct req_submit_work;
+
+ struct list_head complete_list;
+ spinlock_t complete_lock;
+ struct work_struct req_complete_work;
+
+ atomic_t inflight_reqs;
+ wait_queue_head_t inflight_wq;
+
+ u64 dev_size;
+};
+
+struct dm_pcache;
+int backing_dev_start(struct dm_pcache *pcache);
+void backing_dev_stop(struct dm_pcache *pcache);
+
+struct pcache_backing_dev_req_opts {
+ u32 type;
+ union {
+ struct {
+ struct pcache_request *upper_req;
+ u32 req_off;
+ u32 len;
+ } req;
+ struct {
+ void *data;
+ blk_opf_t opf;
+ u32 len;
+ u64 backing_off;
+ } kmem;
+ };
+
+ gfp_t gfp_mask;
+ backing_req_end_fn_t end_fn;
+ void *priv_data;
+};
+
+static inline u32 backing_dev_req_coalesced_max_len(const void *data, u32 len)
+{
+ const void *p = data;
+ u32 done = 0, in_page, to_advance;
+ struct page *first_page, *next_page;
+
+ if (!is_vmalloc_addr(data))
+ return len;
+
+ first_page = vmalloc_to_page(p);
+advance:
+ in_page = PAGE_SIZE - offset_in_page(p);
+ to_advance = min_t(u32, in_page, len - done);
+
+ done += to_advance;
+ p += to_advance;
+
+ if (done == len)
+ return done;
+
+ next_page = vmalloc_to_page(p);
+ if (zone_device_pages_have_same_pgmap(first_page, next_page))
+ goto advance;
+
+ return done;
+}
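The helper above reports how much of a kernel buffer can be coalesced into a single KMEM-type request; beyond that point a new request is needed. A sketch of the chunking loop a caller might use (the loop itself is illustrative, not taken from this patch; data, len and backing_off are assumed locals):

	u32 done = 0;

	while (done < len) {
		u32 chunk = backing_dev_req_coalesced_max_len(data + done, len - done);
		struct pcache_backing_dev_req_opts opts = {
			.type		= BACKING_DEV_REQ_TYPE_KMEM,
			.kmem.data	= data + done,
			.kmem.len	= chunk,
			.kmem.opf	= REQ_OP_WRITE,
			.kmem.backing_off = backing_off + done,
			.gfp_mask	= GFP_NOIO,
		};
		struct pcache_backing_dev_req *req =
			backing_dev_req_create(backing_dev, &opts);

		if (!req)
			break;			/* error handling elided */
		backing_dev_req_submit(req, false);
		done += chunk;
	}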
+
+void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct);
+void backing_dev_req_end(struct pcache_backing_dev_req *backing_req);
+struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev,
+ struct pcache_backing_dev_req_opts *opts);
+struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev,
+ struct pcache_backing_dev_req_opts *opts);
+void backing_dev_req_init(struct pcache_backing_dev_req *backing_req,
+ struct pcache_backing_dev_req_opts *opts);
+void backing_dev_flush(struct pcache_backing_dev *backing_dev);
+
+int pcache_backing_init(void);
+void pcache_backing_exit(void);
+#endif /* _BACKING_DEV_H */
diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c
new file mode 100644
index 000000000000..698697a7a73c
--- /dev/null
+++ b/drivers/md/dm-pcache/cache.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/blk_types.h>
+
+#include "cache.h"
+#include "cache_dev.h"
+#include "backing_dev.h"
+#include "dm_pcache.h"
+
+struct kmem_cache *key_cache;
+
+static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache)
+{
+ return cache->cache_info_addr + cache->info_index;
+}
+
+static void cache_info_write(struct pcache_cache *cache)
+{
+ struct pcache_cache_info *cache_info = &cache->cache_info;
+
+ cache_info->header.seq++;
+ cache_info->header.crc = pcache_meta_crc(&cache_info->header,
+ sizeof(struct pcache_cache_info));
+
+ memcpy_flushcache(get_cache_info_addr(cache), cache_info,
+ sizeof(struct pcache_cache_info));
+
+ cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX;
+}
+
+static void cache_info_init_default(struct pcache_cache *cache);
+static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_options *opts)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_info *cache_info_addr;
+
+ cache_info_addr = pcache_meta_find_latest(&cache->cache_info_addr->header,
+ sizeof(struct pcache_cache_info),
+ PCACHE_CACHE_INFO_SIZE,
+ &cache->cache_info);
+ if (IS_ERR(cache_info_addr))
+ return PTR_ERR(cache_info_addr);
+
+ if (cache_info_addr) {
+ if (opts->data_crc !=
+ (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC)) {
+ pcache_dev_err(pcache, "invalid option for data_crc: %s, expected: %s",
+ opts->data_crc ? "true" : "false",
+ cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC ? "true" : "false");
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ /* init cache_info for new cache */
+ cache_info_init_default(cache);
+ cache_mode_set(cache, opts->cache_mode);
+ if (opts->data_crc)
+ cache->cache_info.flags |= PCACHE_CACHE_FLAGS_DATA_CRC;
+
+ return 0;
+}
+
+static void cache_info_set_gc_percent(struct pcache_cache_info *cache_info, u8 percent)
+{
+ cache_info->flags &= ~PCACHE_CACHE_FLAGS_GC_PERCENT_MASK;
+ cache_info->flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, percent);
+}
+
+int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent)
+{
+ if (percent > PCACHE_CACHE_GC_PERCENT_MAX || percent < PCACHE_CACHE_GC_PERCENT_MIN)
+ return -EINVAL;
+
+ mutex_lock(&cache->cache_info_lock);
+ cache_info_set_gc_percent(&cache->cache_info, percent);
+
+ cache_info_write(cache);
+ mutex_unlock(&cache->cache_info_lock);
+
+ return 0;
+}
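The GC percentage is packed into cache_info.flags through the GENMASK(12, 6) field defined in cache.h later in this patch. A quick worked example of the packing (flag values are illustrative):

	u32 flags = PCACHE_CACHE_FLAGS_DATA_CRC;			/* bit 0 */

	flags &= ~PCACHE_CACHE_FLAGS_GC_PERCENT_MASK;			/* clear bits 12:6 */
	flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, 70);	/* 70 << 6 */

	/* FIELD_GET recovers 70; DATA_CRC and the cache-mode bits are untouched. */
	WARN_ON(FIELD_GET(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, flags) != 70);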
+
+void cache_pos_encode(struct pcache_cache *cache,
+ struct pcache_cache_pos_onmedia *pos_onmedia_base,
+ struct pcache_cache_pos *pos, u64 seq, u32 *index)
+{
+ struct pcache_cache_pos_onmedia pos_onmedia;
+ struct pcache_cache_pos_onmedia *pos_onmedia_addr = pos_onmedia_base + *index;
+
+ pos_onmedia.cache_seg_id = pos->cache_seg->cache_seg_id;
+ pos_onmedia.seg_off = pos->seg_off;
+ pos_onmedia.header.seq = seq;
+ pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia);
+
+ memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia));
+ pmem_wmb();
+
+ *index = (*index + 1) % PCACHE_META_INDEX_MAX;
+}
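Each position is persisted into PCACHE_META_INDEX_MAX rotating on-media slots so that a torn write can only damage the copy currently being written. A compact picture of one update cycle (a slot count of 2 is an assumption; PCACHE_META_INDEX_MAX is defined in pcache_internal.h):

	/* Assuming PCACHE_META_INDEX_MAX == 2:
	 *   update N   -> slot 0, seq = S,     *index becomes 1
	 *   update N+1 -> slot 1, seq = S + 1, *index becomes 0
	 * cache_pos_decode() below then picks whichever slot has a valid CRC
	 * and the highest sequence number via pcache_meta_find_latest().
	 */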
+
+int cache_pos_decode(struct pcache_cache *cache,
+ struct pcache_cache_pos_onmedia *pos_onmedia,
+ struct pcache_cache_pos *pos, u64 *seq, u32 *index)
+{
+ struct pcache_cache_pos_onmedia latest, *latest_addr;
+
+ latest_addr = pcache_meta_find_latest(&pos_onmedia->header,
+ sizeof(struct pcache_cache_pos_onmedia),
+ sizeof(struct pcache_cache_pos_onmedia),
+ &latest);
+ if (IS_ERR(latest_addr))
+ return PTR_ERR(latest_addr);
+
+ if (!latest_addr)
+ return -EIO;
+
+ pos->cache_seg = &cache->segments[latest.cache_seg_id];
+ pos->seg_off = latest.seg_off;
+ *seq = latest.header.seq;
+ *index = (latest_addr - pos_onmedia);
+
+ return 0;
+}
+
+static inline void cache_info_set_seg_id(struct pcache_cache *cache, u32 seg_id)
+{
+ cache->cache_info.seg_id = seg_id;
+}
+
+static int cache_init(struct dm_pcache *pcache)
+{
+ struct pcache_cache *cache = &pcache->cache;
+ struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
+ struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
+ int ret;
+
+ cache->segments = kvcalloc(cache_dev->seg_num, sizeof(struct pcache_cache_segment), GFP_KERNEL);
+ if (!cache->segments) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ cache->seg_map = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL);
+ if (!cache->seg_map) {
+ ret = -ENOMEM;
+ goto free_segments;
+ }
+
+ cache->backing_dev = backing_dev;
+ cache->cache_dev = &pcache->cache_dev;
+ cache->n_segs = cache_dev->seg_num;
+ atomic_set(&cache->gc_errors, 0);
+ spin_lock_init(&cache->seg_map_lock);
+ spin_lock_init(&cache->key_head_lock);
+
+ mutex_init(&cache->cache_info_lock);
+ mutex_init(&cache->key_tail_lock);
+ mutex_init(&cache->dirty_tail_lock);
+ mutex_init(&cache->writeback_lock);
+
+ INIT_DELAYED_WORK(&cache->writeback_work, cache_writeback_fn);
+ INIT_DELAYED_WORK(&cache->gc_work, pcache_cache_gc_fn);
+ INIT_WORK(&cache->clean_work, clean_fn);
+
+ return 0;
+
+free_segments:
+ kvfree(cache->segments);
+err:
+ return ret;
+}
+
+static void cache_exit(struct pcache_cache *cache)
+{
+ kvfree(cache->seg_map);
+ kvfree(cache->segments);
+}
+
+static void cache_info_init_default(struct pcache_cache *cache)
+{
+ struct pcache_cache_info *cache_info = &cache->cache_info;
+
+ memset(cache_info, 0, sizeof(*cache_info));
+ cache_info->n_segs = cache->cache_dev->seg_num;
+ cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT);
+}
+
+static int cache_tail_init(struct pcache_cache *cache)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE);
+
+ if (new_cache) {
+ __set_bit(0, cache->seg_map);
+
+ cache->key_head.cache_seg = &cache->segments[0];
+ cache->key_head.seg_off = 0;
+ cache_pos_copy(&cache->key_tail, &cache->key_head);
+ cache_pos_copy(&cache->dirty_tail, &cache->key_head);
+
+ cache_encode_dirty_tail(cache);
+ cache_encode_key_tail(cache);
+ } else {
+ if (cache_decode_key_tail(cache) || cache_decode_dirty_tail(cache)) {
+ pcache_dev_err(pcache, "Corrupted key tail or dirty tail.\n");
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int get_seg_id(struct pcache_cache *cache,
+ struct pcache_cache_segment *prev_cache_seg,
+ bool new_cache, u32 *seg_id)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_dev *cache_dev = cache->cache_dev;
+ int ret;
+
+ if (new_cache) {
+ ret = cache_dev_get_empty_segment_id(cache_dev, seg_id);
+ if (ret) {
+ pcache_dev_err(pcache, "no available segment\n");
+ goto err;
+ }
+
+ if (prev_cache_seg)
+ cache_seg_set_next_seg(prev_cache_seg, *seg_id);
+ else
+ cache_info_set_seg_id(cache, *seg_id);
+ } else {
+ if (prev_cache_seg) {
+ struct pcache_segment_info *prev_seg_info;
+
+ prev_seg_info = &prev_cache_seg->cache_seg_info;
+ if (!segment_info_has_next(prev_seg_info)) {
+ ret = -EFAULT;
+ goto err;
+ }
+ *seg_id = prev_cache_seg->cache_seg_info.next_seg;
+ } else {
+ *seg_id = cache->cache_info.seg_id;
+ }
+ }
+ return 0;
+err:
+ return ret;
+}
+
+static int cache_segs_init(struct pcache_cache *cache)
+{
+ struct pcache_cache_segment *prev_cache_seg = NULL;
+ struct pcache_cache_info *cache_info = &cache->cache_info;
+ bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE);
+ u32 seg_id;
+ int ret;
+ u32 i;
+
+ for (i = 0; i < cache_info->n_segs; i++) {
+ ret = get_seg_id(cache, prev_cache_seg, new_cache, &seg_id);
+ if (ret)
+ goto err;
+
+ ret = cache_seg_init(cache, seg_id, i, new_cache);
+ if (ret)
+ goto err;
+
+ prev_cache_seg = &cache->segments[i];
+ }
+ return 0;
+err:
+ return ret;
+}
+
+static int cache_init_req_keys(struct pcache_cache *cache, u32 n_paral)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ u32 n_subtrees;
+ int ret;
+ u32 i, cpu;
+
+ /* Calculate the number of cache subtrees based on the device size */
+ n_subtrees = DIV_ROUND_UP(cache->dev_size << SECTOR_SHIFT, PCACHE_CACHE_SUBTREE_SIZE);
+ ret = cache_tree_init(cache, &cache->req_key_tree, n_subtrees);
+ if (ret)
+ goto err;
+
+ cache->n_ksets = n_paral;
+ cache->ksets = kvcalloc(cache->n_ksets, PCACHE_KSET_SIZE, GFP_KERNEL);
+ if (!cache->ksets) {
+ ret = -ENOMEM;
+ goto req_tree_exit;
+ }
+
+ /*
+ * Initialize each kset with a spinlock and delayed work for flushing.
+ * Each kset is associated with one queue to ensure independent handling
+ * of cache keys across multiple queues, maximizing multiqueue concurrency.
+ */
+ for (i = 0; i < cache->n_ksets; i++) {
+ struct pcache_cache_kset *kset = get_kset(cache, i);
+
+ kset->cache = cache;
+ spin_lock_init(&kset->kset_lock);
+ INIT_DELAYED_WORK(&kset->flush_work, kset_flush_fn);
+ }
+
+ cache->data_heads = alloc_percpu(struct pcache_cache_data_head);
+ if (!cache->data_heads) {
+ ret = -ENOMEM;
+ goto free_kset;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct pcache_cache_data_head *h =
+ per_cpu_ptr(cache->data_heads, cpu);
+ h->head_pos.cache_seg = NULL;
+ }
+
+ /*
+ * Replay persisted cache keys using cache_replay.
+ * This function loads and replays cache keys from previously stored
+ * ksets, allowing the cache to restore its state after a restart.
+ */
+ ret = cache_replay(cache);
+ if (ret) {
+ pcache_dev_err(pcache, "failed to replay keys\n");
+ goto free_heads;
+ }
+
+ return 0;
+
+free_heads:
+ free_percpu(cache->data_heads);
+free_kset:
+ kvfree(cache->ksets);
+req_tree_exit:
+ cache_tree_exit(&cache->req_key_tree);
+err:
+ return ret;
+}
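To make the subtree sizing in cache_init_req_keys() concrete, a small worked example (the device size is illustrative):

	/* A 1 TiB backing device with 4 MiB subtrees needs 2^18 subtrees. */
	u64 dev_bytes  = 1ULL << 40;
	u32 n_subtrees = DIV_ROUND_UP(dev_bytes, PCACHE_CACHE_SUBTREE_SIZE);	/* 262144 */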
+
+static void cache_destroy_req_keys(struct pcache_cache *cache)
+{
+ u32 i;
+
+ for (i = 0; i < cache->n_ksets; i++) {
+ struct pcache_cache_kset *kset = get_kset(cache, i);
+
+ cancel_delayed_work_sync(&kset->flush_work);
+ }
+
+ free_percpu(cache->data_heads);
+ kvfree(cache->ksets);
+ cache_tree_exit(&cache->req_key_tree);
+}
+
+int pcache_cache_start(struct dm_pcache *pcache)
+{
+ struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
+ struct pcache_cache *cache = &pcache->cache;
+ struct pcache_cache_options *opts = &pcache->opts;
+ int ret;
+
+ ret = cache_init(pcache);
+ if (ret)
+ return ret;
+
+ cache->cache_info_addr = CACHE_DEV_CACHE_INFO(cache->cache_dev);
+ cache->cache_ctrl = CACHE_DEV_CACHE_CTRL(cache->cache_dev);
+ backing_dev->cache = cache;
+ cache->dev_size = backing_dev->dev_size;
+
+ ret = cache_info_init(cache, opts);
+ if (ret)
+ goto cache_exit;
+
+ ret = cache_segs_init(cache);
+ if (ret)
+ goto cache_exit;
+
+ ret = cache_tail_init(cache);
+ if (ret)
+ goto cache_exit;
+
+ ret = cache_init_req_keys(cache, num_online_cpus());
+ if (ret)
+ goto cache_exit;
+
+ ret = cache_writeback_init(cache);
+ if (ret)
+ goto destroy_keys;
+
+ cache->cache_info.flags |= PCACHE_CACHE_FLAGS_INIT_DONE;
+ cache_info_write(cache);
+ queue_delayed_work(cache_get_wq(cache), &cache->gc_work, 0);
+
+ return 0;
+
+destroy_keys:
+ cache_destroy_req_keys(cache);
+cache_exit:
+ cache_exit(cache);
+
+ return ret;
+}
+
+void pcache_cache_stop(struct dm_pcache *pcache)
+{
+ struct pcache_cache *cache = &pcache->cache;
+
+ pcache_cache_flush(cache);
+
+ cancel_delayed_work_sync(&cache->gc_work);
+ flush_work(&cache->clean_work);
+ cache_writeback_exit(cache);
+
+ if (cache->req_key_tree.n_subtrees)
+ cache_destroy_req_keys(cache);
+
+ cache_exit(cache);
+}
+
+struct workqueue_struct *cache_get_wq(struct pcache_cache *cache)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+
+ return pcache->task_wq;
+}
+
+int pcache_cache_init(void)
+{
+ key_cache = KMEM_CACHE(pcache_cache_key, 0);
+ if (!key_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void pcache_cache_exit(void)
+{
+ kmem_cache_destroy(key_cache);
+}
diff --git a/drivers/md/dm-pcache/cache.h b/drivers/md/dm-pcache/cache.h
new file mode 100644
index 000000000000..27613b56be54
--- /dev/null
+++ b/drivers/md/dm-pcache/cache.h
@@ -0,0 +1,635 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _PCACHE_CACHE_H
+#define _PCACHE_CACHE_H
+
+#include "segment.h"
+
+/* Garbage collection thresholds */
+#define PCACHE_CACHE_GC_PERCENT_MIN 0 /* Minimum GC percentage */
+#define PCACHE_CACHE_GC_PERCENT_MAX 90 /* Maximum GC percentage */
+#define PCACHE_CACHE_GC_PERCENT_DEFAULT 70 /* Default GC percentage */
+
+#define PCACHE_CACHE_SUBTREE_SIZE (4 * PCACHE_MB) /* 4MB subtree size */
+#define PCACHE_CACHE_SUBTREE_SIZE_MASK 0x3FFFFF /* Mask for subtree size */
+#define PCACHE_CACHE_SUBTREE_SIZE_SHIFT 22 /* Bit shift for subtree size */
+
+/* Maximum number of keys per key set */
+#define PCACHE_KSET_KEYS_MAX 128
+#define PCACHE_CACHE_SEGS_MAX (1024 * 1024) /* maximum cache size for each device is 16T */
+#define PCACHE_KSET_ONMEDIA_SIZE_MAX struct_size_t(struct pcache_cache_kset_onmedia, data, PCACHE_KSET_KEYS_MAX)
+#define PCACHE_KSET_SIZE (sizeof(struct pcache_cache_kset) + sizeof(struct pcache_cache_key_onmedia) * PCACHE_KSET_KEYS_MAX)
+
+/* Maximum number of keys to clean in one round of clean_work */
+#define PCACHE_CLEAN_KEYS_MAX 10
+
+/* Writeback and garbage collection intervals in jiffies */
+#define PCACHE_CACHE_WRITEBACK_INTERVAL (5 * HZ)
+#define PCACHE_CACHE_GC_INTERVAL (5 * HZ)
+
+/* Macro to get the cache key structure from an rb_node pointer */
+#define CACHE_KEY(node) (container_of(node, struct pcache_cache_key, rb_node))
+
+struct pcache_cache_pos_onmedia {
+ struct pcache_meta_header header;
+ __u32 cache_seg_id;
+ __u32 seg_off;
+};
+
+/* Offset and size definitions for cache segment control */
+#define PCACHE_CACHE_SEG_CTRL_OFF (PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX)
+#define PCACHE_CACHE_SEG_CTRL_SIZE (4 * PCACHE_KB)
+
+struct pcache_cache_seg_gen {
+ struct pcache_meta_header header;
+ __u64 gen;
+};
+
+/* Control structure for cache segments */
+struct pcache_cache_seg_ctrl {
+ struct pcache_cache_seg_gen gen[PCACHE_META_INDEX_MAX];
+ __u64 res[64];
+};
+
+#define PCACHE_CACHE_FLAGS_DATA_CRC BIT(0)
+#define PCACHE_CACHE_FLAGS_INIT_DONE BIT(1)
+
+#define PCACHE_CACHE_FLAGS_CACHE_MODE_MASK GENMASK(5, 2)
+#define PCACHE_CACHE_MODE_WRITEBACK 0
+#define PCACHE_CACHE_MODE_WRITETHROUGH 1
+#define PCACHE_CACHE_MODE_WRITEAROUND 2
+#define PCACHE_CACHE_MODE_WRITEONLY 3
+
+#define PCACHE_CACHE_FLAGS_GC_PERCENT_MASK GENMASK(12, 6)
+
+struct pcache_cache_info {
+ struct pcache_meta_header header;
+ __u32 seg_id;
+ __u32 n_segs;
+ __u32 flags;
+ __u32 reserved;
+};
+
+struct pcache_cache_pos {
+ struct pcache_cache_segment *cache_seg;
+ u32 seg_off;
+};
+
+struct pcache_cache_segment {
+ struct pcache_cache *cache;
+ u32 cache_seg_id; /* Index in cache->segments */
+ struct pcache_segment segment;
+ atomic_t refs;
+
+ struct pcache_segment_info cache_seg_info;
+ struct mutex info_lock;
+ u32 info_index;
+
+ spinlock_t gen_lock;
+ u64 gen;
+ u64 gen_seq;
+ u32 gen_index;
+
+ struct pcache_cache_seg_ctrl *cache_seg_ctrl;
+};
+
+/* rbtree for cache entries */
+struct pcache_cache_subtree {
+ struct rb_root root;
+ spinlock_t tree_lock;
+};
+
+struct pcache_cache_tree {
+ struct pcache_cache *cache;
+ u32 n_subtrees;
+ mempool_t key_pool;
+ struct pcache_cache_subtree *subtrees;
+};
+
+extern struct kmem_cache *key_cache;
+
+struct pcache_cache_key {
+ struct pcache_cache_tree *cache_tree;
+ struct pcache_cache_subtree *cache_subtree;
+ struct kref ref;
+ struct rb_node rb_node;
+ struct list_head list_node;
+ u64 off;
+ u32 len;
+ u32 flags;
+ struct pcache_cache_pos cache_pos;
+ u64 seg_gen;
+};
+
+#define PCACHE_CACHE_KEY_FLAGS_EMPTY BIT(0)
+#define PCACHE_CACHE_KEY_FLAGS_CLEAN BIT(1)
+
+struct pcache_cache_key_onmedia {
+ __u64 off;
+ __u32 len;
+ __u32 flags;
+ __u32 cache_seg_id;
+ __u32 cache_seg_off;
+ __u64 seg_gen;
+ __u32 data_crc;
+ __u32 reserved;
+};
+
+struct pcache_cache_kset_onmedia {
+ __u32 crc;
+ union {
+ __u32 key_num;
+ __u32 next_cache_seg_id;
+ };
+ __u64 magic;
+ __u64 flags;
+ struct pcache_cache_key_onmedia data[];
+};
+
+struct pcache_cache {
+ struct pcache_backing_dev *backing_dev;
+ struct pcache_cache_dev *cache_dev;
+ struct pcache_cache_ctrl *cache_ctrl;
+ u64 dev_size;
+
+ struct pcache_cache_data_head __percpu *data_heads;
+
+ spinlock_t key_head_lock;
+ struct pcache_cache_pos key_head;
+ u32 n_ksets;
+ struct pcache_cache_kset *ksets;
+
+ struct mutex key_tail_lock;
+ struct pcache_cache_pos key_tail;
+ u64 key_tail_seq;
+ u32 key_tail_index;
+
+ struct mutex dirty_tail_lock;
+ struct pcache_cache_pos dirty_tail;
+ u64 dirty_tail_seq;
+ u32 dirty_tail_index;
+
+ struct pcache_cache_tree req_key_tree;
+ struct work_struct clean_work;
+
+ struct mutex writeback_lock;
+ char wb_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX];
+ struct pcache_cache_tree writeback_key_tree;
+ struct delayed_work writeback_work;
+ struct {
+ atomic_t pending;
+ u32 advance;
+ int ret;
+ } writeback_ctx;
+
+ char gc_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX];
+ struct delayed_work gc_work;
+ atomic_t gc_errors;
+
+ struct mutex cache_info_lock;
+ struct pcache_cache_info cache_info;
+ struct pcache_cache_info *cache_info_addr;
+ u32 info_index;
+
+ u32 n_segs;
+ unsigned long *seg_map;
+ u32 last_cache_seg;
+ bool cache_full;
+ spinlock_t seg_map_lock;
+ struct pcache_cache_segment *segments;
+};
+
+struct workqueue_struct *cache_get_wq(struct pcache_cache *cache);
+
+struct dm_pcache;
+struct pcache_cache_options {
+ u32 cache_mode:4;
+ u32 data_crc:1;
+};
+int pcache_cache_start(struct dm_pcache *pcache);
+void pcache_cache_stop(struct dm_pcache *pcache);
+
+struct pcache_cache_ctrl {
+ /* Updated by gc_thread */
+ struct pcache_cache_pos_onmedia key_tail_pos[PCACHE_META_INDEX_MAX];
+
+ /* Updated by writeback_thread */
+ struct pcache_cache_pos_onmedia dirty_tail_pos[PCACHE_META_INDEX_MAX];
+};
+
+struct pcache_cache_data_head {
+ struct pcache_cache_pos head_pos;
+};
+
+static inline u16 pcache_cache_get_gc_percent(struct pcache_cache *cache)
+{
+ return FIELD_GET(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, cache->cache_info.flags);
+}
+
+int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent);
+
+/* cache key */
+struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask);
+void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key);
+void cache_key_get(struct pcache_cache_key *key);
+void cache_key_put(struct pcache_cache_key *key);
+int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close);
+void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup);
+int cache_key_decode(struct pcache_cache *cache,
+ struct pcache_cache_key_onmedia *key_onmedia,
+ struct pcache_cache_key *key);
+void cache_pos_advance(struct pcache_cache_pos *pos, u32 len);
+
+#define PCACHE_KSET_FLAGS_LAST BIT(0)
+#define PCACHE_KSET_MAGIC 0x676894a64e164f1aULL
+
+struct pcache_cache_kset {
+ struct pcache_cache *cache;
+ spinlock_t kset_lock;
+ struct delayed_work flush_work;
+ struct pcache_cache_kset_onmedia kset_onmedia;
+};
+
+extern struct pcache_cache_kset_onmedia pcache_empty_kset;
+
+#define SUBTREE_WALK_RET_OK 0
+#define SUBTREE_WALK_RET_ERR 1
+#define SUBTREE_WALK_RET_NEED_KEY 2
+#define SUBTREE_WALK_RET_NEED_REQ 3
+#define SUBTREE_WALK_RET_RESEARCH 4
+
+struct pcache_cache_subtree_walk_ctx {
+ struct pcache_cache_tree *cache_tree;
+ struct rb_node *start_node;
+ struct pcache_request *pcache_req;
+ struct pcache_cache_key *key;
+ u32 req_done;
+ int ret;
+
+ /* pre-allocated key and backing_dev_req */
+ struct pcache_cache_key *pre_alloc_key;
+ struct pcache_backing_dev_req *pre_alloc_req;
+
+ struct list_head *delete_key_list;
+ struct list_head *submit_req_list;
+
+ /*
+ * |--------| key_tmp
+ * |====| key
+ */
+ int (*before)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx);
+
+ /*
+ * |----------| key_tmp
+ * |=====| key
+ */
+ int (*after)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx);
+
+ /*
+ * |----------------| key_tmp
+ * |===========| key
+ */
+ int (*overlap_tail)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx);
+
+ /*
+ * |--------| key_tmp
+ * |==========| key
+ */
+ int (*overlap_head)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx);
+
+ /*
+ * |----| key_tmp
+ * |==========| key
+ */
+ int (*overlap_contain)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx);
+
+ /*
+ * |-----------| key_tmp
+ * |====| key
+ */
+ int (*overlap_contained)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx);
+
+ int (*walk_finally)(struct pcache_cache_subtree_walk_ctx *ctx, int ret);
+ bool (*walk_done)(struct pcache_cache_subtree_walk_ctx *ctx);
+};
+
+int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx);
+struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key,
+ struct rb_node **parentp, struct rb_node ***newp,
+ struct list_head *delete_key_list);
+int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset);
+void clean_fn(struct work_struct *work);
+void kset_flush_fn(struct work_struct *work);
+int cache_replay(struct pcache_cache *cache);
+int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees);
+void cache_tree_clear(struct pcache_cache_tree *cache_tree);
+void cache_tree_exit(struct pcache_cache_tree *cache_tree);
+
+/* cache segments */
+struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache);
+int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id,
+ bool new_cache);
+void cache_seg_get(struct pcache_cache_segment *cache_seg);
+void cache_seg_put(struct pcache_cache_segment *cache_seg);
+void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id);
+
+/* cache request*/
+int pcache_cache_flush(struct pcache_cache *cache);
+void miss_read_end_work_fn(struct work_struct *work);
+int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req);
+
+/* gc */
+void pcache_cache_gc_fn(struct work_struct *work);
+
+/* writeback */
+void cache_writeback_exit(struct pcache_cache *cache);
+int cache_writeback_init(struct pcache_cache *cache);
+void cache_writeback_fn(struct work_struct *work);
+
+/* inline functions */
+static inline struct pcache_cache_subtree *get_subtree(struct pcache_cache_tree *cache_tree, u64 off)
+{
+ if (cache_tree->n_subtrees == 1)
+ return &cache_tree->subtrees[0];
+
+ return &cache_tree->subtrees[off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT];
+}
+
+static inline void *cache_pos_addr(struct pcache_cache_pos *pos)
+{
+ return (pos->cache_seg->segment.data + pos->seg_off);
+}
+
+static inline void *get_key_head_addr(struct pcache_cache *cache)
+{
+ return cache_pos_addr(&cache->key_head);
+}
+
+static inline u32 get_kset_id(struct pcache_cache *cache, u64 off)
+{
+ u32 kset_id;
+
+ div_u64_rem(off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT, cache->n_ksets, &kset_id);
+
+ return kset_id;
+}
+
+static inline struct pcache_cache_kset *get_kset(struct pcache_cache *cache, u32 kset_id)
+{
+ return (void *)cache->ksets + PCACHE_KSET_SIZE * kset_id;
+}
+
+static inline struct pcache_cache_data_head *get_data_head(struct pcache_cache *cache)
+{
+ return this_cpu_ptr(cache->data_heads);
+}
+
+static inline bool cache_key_empty(struct pcache_cache_key *key)
+{
+ return key->flags & PCACHE_CACHE_KEY_FLAGS_EMPTY;
+}
+
+static inline bool cache_key_clean(struct pcache_cache_key *key)
+{
+ return key->flags & PCACHE_CACHE_KEY_FLAGS_CLEAN;
+}
+
+static inline void cache_pos_copy(struct pcache_cache_pos *dst, struct pcache_cache_pos *src)
+{
+ memcpy(dst, src, sizeof(struct pcache_cache_pos));
+}
+
+/**
+ * cache_seg_is_ctrl_seg - Checks if a cache segment is a cache ctrl segment.
+ * @cache_seg_id: ID of the cache segment.
+ *
+ * Returns true if the cache segment ID corresponds to a cache ctrl segment.
+ *
+ * Note: We extend the segment control of the first cache segment
+ * (cache segment ID 0) to serve as the cache control (pcache_cache_ctrl)
+ * for the entire PCACHE cache. This function determines whether the given
+ * cache segment is the one storing the pcache_cache_ctrl information.
+ */
+static inline bool cache_seg_is_ctrl_seg(u32 cache_seg_id)
+{
+ return (cache_seg_id == 0);
+}
+
+/**
+ * cache_key_cutfront - Cuts a specified length from the front of a cache key.
+ * @key: Pointer to pcache_cache_key structure.
+ * @cut_len: Length to cut from the front.
+ *
+ * Advances the cache key position by cut_len and adjusts offset and length accordingly.
+ */
+static inline void cache_key_cutfront(struct pcache_cache_key *key, u32 cut_len)
+{
+ if (key->cache_pos.cache_seg)
+ cache_pos_advance(&key->cache_pos, cut_len);
+
+ key->off += cut_len;
+ key->len -= cut_len;
+}
+
+/**
+ * cache_key_cutback - Cuts a specified length from the back of a cache key.
+ * @key: Pointer to pcache_cache_key structure.
+ * @cut_len: Length to cut from the back.
+ *
+ * Reduces the length of the cache key by cut_len.
+ */
+static inline void cache_key_cutback(struct pcache_cache_key *key, u32 cut_len)
+{
+ key->len -= cut_len;
+}
+
+static inline void cache_key_delete(struct pcache_cache_key *key)
+{
+ struct pcache_cache_subtree *cache_subtree;
+
+ cache_subtree = key->cache_subtree;
+ BUG_ON(!cache_subtree);
+
+ rb_erase(&key->rb_node, &cache_subtree->root);
+ key->flags = 0;
+ cache_key_put(key);
+}
+
+static inline bool cache_data_crc_on(struct pcache_cache *cache)
+{
+ return (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC);
+}
+
+static inline u32 cache_mode_get(struct pcache_cache *cache)
+{
+ return FIELD_GET(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache->cache_info.flags);
+}
+
+static inline void cache_mode_set(struct pcache_cache *cache, u32 cache_mode)
+{
+ cache->cache_info.flags &= ~PCACHE_CACHE_FLAGS_CACHE_MODE_MASK;
+ cache->cache_info.flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache_mode);
+}
+
+/**
+ * cache_key_data_crc - Calculates CRC for data in a cache key.
+ * @key: Pointer to the pcache_cache_key structure.
+ *
+ * Returns the CRC-32 checksum of the data within the cache key's position.
+ */
+static inline u32 cache_key_data_crc(struct pcache_cache_key *key)
+{
+ void *data;
+
+ data = cache_pos_addr(&key->cache_pos);
+
+ return crc32c(PCACHE_CRC_SEED, data, key->len);
+}
+
+static inline u32 cache_kset_crc(struct pcache_cache_kset_onmedia *kset_onmedia)
+{
+ u32 crc_size;
+
+ if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST)
+ crc_size = sizeof(struct pcache_cache_kset_onmedia) - 4;
+ else
+ crc_size = struct_size(kset_onmedia, data, kset_onmedia->key_num) - 4;
+
+ return crc32c(PCACHE_CRC_SEED, (void *)kset_onmedia + 4, crc_size);
+}
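cache_kset_crc() deliberately skips the first 4 bytes of the on-media kset, which hold the crc field itself. A sketch of how the replay path might validate a kset read back from the cache device (the helper name is hypothetical):

	static bool kset_onmedia_valid(struct pcache_cache_kset_onmedia *kset_onmedia)
	{
		if (kset_onmedia->magic != PCACHE_KSET_MAGIC)
			return false;

		return kset_onmedia->crc == cache_kset_crc(kset_onmedia);
	}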
+
+static inline u32 get_kset_onmedia_size(struct pcache_cache_kset_onmedia *kset_onmedia)
+{
+ return struct_size_t(struct pcache_cache_kset_onmedia, data, kset_onmedia->key_num);
+}
+
+/**
+ * cache_seg_remain - Computes remaining space in a cache segment.
+ * @pos: Pointer to pcache_cache_pos structure.
+ *
+ * Returns the amount of remaining space in the segment data starting from
+ * the current position offset.
+ */
+static inline u32 cache_seg_remain(struct pcache_cache_pos *pos)
+{
+ struct pcache_cache_segment *cache_seg;
+ struct pcache_segment *segment;
+ u32 seg_remain;
+
+ cache_seg = pos->cache_seg;
+ segment = &cache_seg->segment;
+ seg_remain = segment->data_size - pos->seg_off;
+
+ return seg_remain;
+}
+
+/**
+ * cache_key_invalid - Checks if a cache key is invalid.
+ * @key: Pointer to pcache_cache_key structure.
+ *
+ * Returns true if the cache key is invalid due to its generation being
+ * less than the generation of its segment; otherwise returns false.
+ *
+ * When the GC (garbage collection) thread identifies a segment
+ * as reclaimable, it increments the segment's generation (gen). However,
+ * it does not immediately remove all related cache keys. When accessing
+ * such a cache key, this function can be used to determine if the cache
+ * key has already become invalid.
+ */
+static inline bool cache_key_invalid(struct pcache_cache_key *key)
+{
+ if (cache_key_empty(key))
+ return false;
+
+ return (key->seg_gen < key->cache_pos.cache_seg->gen);
+}
+
+/**
+ * cache_key_lstart - Retrieves the logical start offset of a cache key.
+ * @key: Pointer to pcache_cache_key structure.
+ *
+ * Returns the logical start offset for the cache key.
+ */
+static inline u64 cache_key_lstart(struct pcache_cache_key *key)
+{
+ return key->off;
+}
+
+/**
+ * cache_key_lend - Retrieves the logical end offset of a cache key.
+ * @key: Pointer to pcache_cache_key structure.
+ *
+ * Returns the logical end offset for the cache key.
+ */
+static inline u64 cache_key_lend(struct pcache_cache_key *key)
+{
+ return key->off + key->len;
+}
+
+static inline void cache_key_copy(struct pcache_cache_key *key_dst, struct pcache_cache_key *key_src)
+{
+ key_dst->off = key_src->off;
+ key_dst->len = key_src->len;
+ key_dst->seg_gen = key_src->seg_gen;
+ key_dst->cache_tree = key_src->cache_tree;
+ key_dst->cache_subtree = key_src->cache_subtree;
+ key_dst->flags = key_src->flags;
+
+ cache_pos_copy(&key_dst->cache_pos, &key_src->cache_pos);
+}
+
+/**
+ * cache_pos_onmedia_crc - Calculates the CRC for an on-media cache position.
+ * @pos_om: Pointer to pcache_cache_pos_onmedia structure.
+ *
+ * Calculates the CRC-32 checksum of the position, excluding the first 4 bytes.
+ * Returns the computed CRC value.
+ */
+static inline u32 cache_pos_onmedia_crc(struct pcache_cache_pos_onmedia *pos_om)
+{
+ return pcache_meta_crc(&pos_om->header, sizeof(struct pcache_cache_pos_onmedia));
+}
+
+void cache_pos_encode(struct pcache_cache *cache,
+ struct pcache_cache_pos_onmedia *pos_onmedia,
+ struct pcache_cache_pos *pos, u64 seq, u32 *index);
+int cache_pos_decode(struct pcache_cache *cache,
+ struct pcache_cache_pos_onmedia *pos_onmedia,
+ struct pcache_cache_pos *pos, u64 *seq, u32 *index);
+
+static inline void cache_encode_key_tail(struct pcache_cache *cache)
+{
+ cache_pos_encode(cache, cache->cache_ctrl->key_tail_pos,
+ &cache->key_tail, ++cache->key_tail_seq,
+ &cache->key_tail_index);
+}
+
+static inline int cache_decode_key_tail(struct pcache_cache *cache)
+{
+ return cache_pos_decode(cache, cache->cache_ctrl->key_tail_pos,
+ &cache->key_tail, &cache->key_tail_seq,
+ &cache->key_tail_index);
+}
+
+static inline void cache_encode_dirty_tail(struct pcache_cache *cache)
+{
+ cache_pos_encode(cache, cache->cache_ctrl->dirty_tail_pos,
+ &cache->dirty_tail, ++cache->dirty_tail_seq,
+ &cache->dirty_tail_index);
+}
+
+static inline int cache_decode_dirty_tail(struct pcache_cache *cache)
+{
+ return cache_pos_decode(cache, cache->cache_ctrl->dirty_tail_pos,
+ &cache->dirty_tail, &cache->dirty_tail_seq,
+ &cache->dirty_tail_index);
+}
+
+int pcache_cache_init(void);
+void pcache_cache_exit(void);
+#endif /* _PCACHE_CACHE_H */
diff --git a/drivers/md/dm-pcache/cache_dev.c b/drivers/md/dm-pcache/cache_dev.c
new file mode 100644
index 000000000000..ece689e6ce59
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_dev.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/blkdev.h>
+#include <linux/dax.h>
+#include <linux/vmalloc.h>
+#include <linux/parser.h>
+
+#include "cache_dev.h"
+#include "backing_dev.h"
+#include "cache.h"
+#include "dm_pcache.h"
+
+static void cache_dev_dax_exit(struct pcache_cache_dev *cache_dev)
+{
+ if (cache_dev->use_vmap)
+ vunmap(cache_dev->mapping);
+}
+
+static int build_vmap(struct dax_device *dax_dev, long total_pages, void **vaddr)
+{
+ struct page **pages;
+ long i = 0, chunk;
+ unsigned long pfn;
+ int ret;
+
+ pages = vmalloc_array(total_pages, sizeof(struct page *));
+ if (!pages)
+ return -ENOMEM;
+
+ do {
+ chunk = dax_direct_access(dax_dev, i, total_pages - i,
+ DAX_ACCESS, NULL, &pfn);
+ if (chunk <= 0) {
+ ret = chunk ? chunk : -EINVAL;
+ goto out_free;
+ }
+
+ if (!pfn_valid(pfn)) {
+ ret = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ while (chunk-- && i < total_pages) {
+ pages[i++] = pfn_to_page(pfn);
+ pfn++;
+ if (!(i & 15))
+ cond_resched();
+ }
+ } while (i < total_pages);
+
+ *vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL);
+ if (!*vaddr) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ ret = 0;
+
+out_free:
+ vfree(pages);
+ return ret;
+}
+
+static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev)
+{
+ struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
+ struct dax_device *dax_dev;
+ long total_pages, mapped_pages;
+ u64 bdev_size;
+ void *vaddr;
+ int ret;
+ int id;
+ unsigned long pfn;
+
+ dax_dev = cache_dev->dm_dev->dax_dev;
+ /* total size check */
+ bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev);
+ if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) {
+ pcache_dev_err(pcache, "dax device is too small, required at least %llu",
+ PCACHE_CACHE_DEV_SIZE_MIN);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ total_pages = bdev_size >> PAGE_SHIFT;
+ /* attempt: direct-map the whole range */
+ id = dax_read_lock();
+ mapped_pages = dax_direct_access(dax_dev, 0, total_pages,
+ DAX_ACCESS, &vaddr, &pfn);
+ if (mapped_pages < 0) {
+ pcache_dev_err(pcache, "dax_direct_access failed: %ld\n", mapped_pages);
+ ret = mapped_pages;
+ goto unlock;
+ }
+
+ if (!pfn_valid(pfn)) {
+ ret = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ if (mapped_pages == total_pages) {
+ /* success: contiguous direct mapping */
+ cache_dev->mapping = vaddr;
+ } else {
+ /* need vmap fallback */
+ ret = build_vmap(dax_dev, total_pages, &vaddr);
+ if (ret) {
+ pcache_dev_err(pcache, "vmap fallback failed: %d\n", ret);
+ goto unlock;
+ }
+
+ cache_dev->mapping = vaddr;
+ cache_dev->use_vmap = true;
+ }
+ dax_read_unlock(id);
+
+ return 0;
+unlock:
+ dax_read_unlock(id);
+out:
+ return ret;
+}
+
+void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size)
+{
+ memset(pos, 0, size);
+ dax_flush(cache_dev->dm_dev->dax_dev, pos, size);
+}
+
+static int sb_read(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
+{
+ struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev);
+
+ if (copy_mc_to_kernel(sb, sb_addr, sizeof(struct pcache_sb)))
+ return -EIO;
+
+ return 0;
+}
+
+static void sb_write(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
+{
+ struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev);
+
+ memcpy_flushcache(sb_addr, sb, sizeof(struct pcache_sb));
+ pmem_wmb();
+}
+
+static int sb_init(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
+{
+ struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
+ u64 nr_segs;
+ u64 cache_dev_size;
+ u64 magic;
+ u32 flags = 0;
+
+ magic = le64_to_cpu(sb->magic);
+ if (magic)
+ return -EEXIST;
+
+ cache_dev_size = bdev_nr_bytes(file_bdev(cache_dev->dm_dev->bdev_file));
+ if (cache_dev_size < PCACHE_CACHE_DEV_SIZE_MIN) {
+ pcache_dev_err(pcache, "dax device is too small, required at least %llu",
+ PCACHE_CACHE_DEV_SIZE_MIN);
+ return -ENOSPC;
+ }
+
+ nr_segs = (cache_dev_size - PCACHE_SEGMENTS_OFF) / PCACHE_SEG_SIZE;
+
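+ /* Record the host endianness in the superblock; sb_validate() rejects a cache_dev formatted with the opposite endianness. */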
+#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN)
+ flags |= PCACHE_SB_F_BIGENDIAN;
+#endif
+ sb->flags = cpu_to_le32(flags);
+ sb->magic = cpu_to_le64(PCACHE_MAGIC);
+ sb->seg_num = cpu_to_le32(nr_segs);
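+ /* The CRC covers everything in the superblock after the 4-byte crc field itself. */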
+ sb->crc = cpu_to_le32(crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4));
+
+ cache_dev_zero_range(cache_dev, CACHE_DEV_CACHE_INFO(cache_dev),
+ PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX +
+ PCACHE_CACHE_CTRL_SIZE);
+
+ return 0;
+}
+
+static int sb_validate(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
+{
+ struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
+ u32 flags;
+ u32 crc;
+
+ if (le64_to_cpu(sb->magic) != PCACHE_MAGIC) {
+ pcache_dev_err(pcache, "unexpected magic: %llx\n",
+ le64_to_cpu(sb->magic));
+ return -EINVAL;
+ }
+
+ crc = crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4);
+ if (crc != le32_to_cpu(sb->crc)) {
+ pcache_dev_err(pcache, "corrupted sb: %u, expected: %u\n", crc, le32_to_cpu(sb->crc));
+ return -EINVAL;
+ }
+
+ flags = le32_to_cpu(sb->flags);
+#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN)
+ if (!(flags & PCACHE_SB_F_BIGENDIAN)) {
+ pcache_dev_err(pcache, "cache_dev is not big endian\n");
+ return -EINVAL;
+ }
+#else
+ if (flags & PCACHE_SB_F_BIGENDIAN) {
+ pcache_dev_err(pcache, "cache_dev is big endian\n");
+ return -EINVAL;
+ }
+#endif
+ return 0;
+}
+
+static int cache_dev_init(struct pcache_cache_dev *cache_dev, u32 seg_num)
+{
+ cache_dev->seg_num = seg_num;
+ cache_dev->seg_bitmap = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL);
+ if (!cache_dev->seg_bitmap)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void cache_dev_exit(struct pcache_cache_dev *cache_dev)
+{
+ kvfree(cache_dev->seg_bitmap);
+}
+
+void cache_dev_stop(struct dm_pcache *pcache)
+{
+ struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
+
+ cache_dev_exit(cache_dev);
+ cache_dev_dax_exit(cache_dev);
+}
+
+int cache_dev_start(struct dm_pcache *pcache)
+{
+ struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
+ struct pcache_sb sb;
+ bool format = false;
+ int ret;
+
+ mutex_init(&cache_dev->seg_lock);
+
+ ret = cache_dev_dax_init(cache_dev);
+ if (ret) {
+ pcache_dev_err(pcache, "failed to init cache_dev %s via dax way: %d.",
+ cache_dev->dm_dev->name, ret);
+ goto err;
+ }
+
+ ret = sb_read(cache_dev, &sb);
+ if (ret)
+ goto dax_release;
+
+ if (le64_to_cpu(sb.magic) == 0) {
+ format = true;
+ ret = sb_init(cache_dev, &sb);
+ if (ret < 0)
+ goto dax_release;
+ }
+
+ ret = sb_validate(cache_dev, &sb);
+ if (ret)
+ goto dax_release;
+
+ cache_dev->sb_flags = le32_to_cpu(sb.flags);
+ ret = cache_dev_init(cache_dev, le32_to_cpu(sb.seg_num));
+ if (ret)
+ goto dax_release;
+
+ if (format)
+ sb_write(cache_dev, &sb);
+
+ return 0;
+
+dax_release:
+ cache_dev_dax_exit(cache_dev);
+err:
+ return ret;
+}
+
+int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id)
+{
+ int ret;
+
+ mutex_lock(&cache_dev->seg_lock);
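+ /* Find the first unallocated segment and mark it as in use. */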
+ *seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, 0);
+ if (*seg_id == cache_dev->seg_num) {
+ ret = -ENOSPC;
+ goto unlock;
+ }
+
+ __set_bit(*seg_id, cache_dev->seg_bitmap);
+ ret = 0;
+unlock:
+ mutex_unlock(&cache_dev->seg_lock);
+ return ret;
+}
diff --git a/drivers/md/dm-pcache/cache_dev.h b/drivers/md/dm-pcache/cache_dev.h
new file mode 100644
index 000000000000..6251eb4ebe96
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_dev.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _PCACHE_CACHE_DEV_H
+#define _PCACHE_CACHE_DEV_H
+
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+
+#include "pcache_internal.h"
+
+#define PCACHE_MAGIC 0x65B05EFA96C596EFULL
+
+#define PCACHE_SB_OFF (4 * PCACHE_KB)
+#define PCACHE_SB_SIZE (4 * PCACHE_KB)
+
+#define PCACHE_CACHE_INFO_OFF (PCACHE_SB_OFF + PCACHE_SB_SIZE)
+#define PCACHE_CACHE_INFO_SIZE (4 * PCACHE_KB)
+
+#define PCACHE_CACHE_CTRL_OFF (PCACHE_CACHE_INFO_OFF + (PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX))
+#define PCACHE_CACHE_CTRL_SIZE (4 * PCACHE_KB)
+
+#define PCACHE_SEGMENTS_OFF (PCACHE_CACHE_CTRL_OFF + PCACHE_CACHE_CTRL_SIZE)
+#define PCACHE_SEG_INFO_SIZE (4 * PCACHE_KB)
+
+#define PCACHE_CACHE_DEV_SIZE_MIN (512 * PCACHE_MB) /* 512 MB */
+#define PCACHE_SEG_SIZE (16 * PCACHE_MB) /* Size of each PCACHE segment (16 MB) */
+
+#define CACHE_DEV_SB(cache_dev) ((struct pcache_sb *)(cache_dev->mapping + PCACHE_SB_OFF))
+#define CACHE_DEV_CACHE_INFO(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_INFO_OFF)
+#define CACHE_DEV_CACHE_CTRL(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_CTRL_OFF)
+#define CACHE_DEV_SEGMENTS(cache_dev) ((void *)cache_dev->mapping + PCACHE_SEGMENTS_OFF)
+#define CACHE_DEV_SEGMENT(cache_dev, id) ((void *)CACHE_DEV_SEGMENTS(cache_dev) + (u64)id * PCACHE_SEG_SIZE)
+
+/*
+ * PCACHE SB flags configured during formatting
+ *
+ * The PCACHE_SB_F_xxx flags define registration requirements based on cache_dev
+ * formatting. For a machine to register a cache_dev:
+ * - PCACHE_SB_F_BIGENDIAN: Requires a big-endian machine.
+ */
+#define PCACHE_SB_F_BIGENDIAN BIT(0)
+
+struct pcache_sb {
+ __le32 crc;
+ __le32 flags;
+ __le64 magic;
+
+ __le32 seg_num;
+};
+
+struct pcache_cache_dev {
+ u32 sb_flags;
+ u32 seg_num;
+ void *mapping;
+ bool use_vmap;
+
+ struct dm_dev *dm_dev;
+
+ struct mutex seg_lock;
+ unsigned long *seg_bitmap;
+};
+
+struct dm_pcache;
+int cache_dev_start(struct dm_pcache *pcache);
+void cache_dev_stop(struct dm_pcache *pcache);
+
+void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size);
+
+int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id);
+
+#endif /* _PCACHE_CACHE_DEV_H */
diff --git a/drivers/md/dm-pcache/cache_gc.c b/drivers/md/dm-pcache/cache_gc.c
new file mode 100644
index 000000000000..94f8b276a021
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_gc.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include "cache.h"
+#include "backing_dev.h"
+#include "cache_dev.h"
+#include "dm_pcache.h"
+
+/**
+ * cache_key_gc - Release the cache segment reference held by a cache key.
+ * @cache: Pointer to the pcache_cache structure.
+ * @key: Pointer to the cache key to be garbage collected.
+ *
+ * This function decrements the reference count of the cache segment
+ * associated with the given key. If the reference count drops to zero,
+ * the segment may be invalidated and reused.
+ */
+static void cache_key_gc(struct pcache_cache *cache, struct pcache_cache_key *key)
+{
+ cache_seg_put(key->cache_pos.cache_seg);
+}
+
+static bool need_gc(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail, struct pcache_cache_pos *key_tail)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ void *dirty_addr, *key_addr;
+ u32 segs_used, segs_gc_threshold, to_copy;
+ int ret;
+
+ dirty_addr = cache_pos_addr(dirty_tail);
+ key_addr = cache_pos_addr(key_tail);
+ if (dirty_addr == key_addr) {
+ pcache_dev_debug(pcache, "key tail is equal to dirty tail: %u:%u\n",
+ dirty_tail->cache_seg->cache_seg_id,
+ dirty_tail->seg_off);
+ return false;
+ }
+
+ kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf;
+
+ to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - key_tail->seg_off);
+ ret = copy_mc_to_kernel(kset_onmedia, key_addr, to_copy);
+ if (ret) {
+ pcache_dev_err(pcache, "error to read kset: %d", ret);
+ return false;
+ }
+
+ /* Check if kset_onmedia is corrupted */
+ if (kset_onmedia->magic != PCACHE_KSET_MAGIC) {
+ pcache_dev_debug(pcache, "gc error: magic is not as expected. key_tail: %u:%u magic: %llx, expected: %llx\n",
+ key_tail->cache_seg->cache_seg_id, key_tail->seg_off,
+ kset_onmedia->magic, PCACHE_KSET_MAGIC);
+ return false;
+ }
+
+ /* Verify the CRC of the kset_onmedia */
+ if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
+ pcache_dev_debug(pcache, "gc error: crc is not as expected. crc: %x, expected: %x\n",
+ cache_kset_crc(kset_onmedia), kset_onmedia->crc);
+ return false;
+ }
+
+ segs_used = bitmap_weight(cache->seg_map, cache->n_segs);
+ segs_gc_threshold = cache->n_segs * pcache_cache_get_gc_percent(cache) / 100;
+ if (segs_used < segs_gc_threshold) {
+ pcache_dev_debug(pcache, "segs_used: %u, segs_gc_threshold: %u\n", segs_used, segs_gc_threshold);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * last_kset_gc - Advances the garbage collection for the last kset.
+ * @cache: Pointer to the pcache_cache structure.
+ * @kset_onmedia: Pointer to the kset_onmedia structure for the last kset.
+ */
+static void last_kset_gc(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_segment *cur_seg, *next_seg;
+
+ cur_seg = cache->key_tail.cache_seg;
+
+ next_seg = &cache->segments[kset_onmedia->next_cache_seg_id];
+
+ mutex_lock(&cache->key_tail_lock);
+ cache->key_tail.cache_seg = next_seg;
+ cache->key_tail.seg_off = 0;
+ cache_encode_key_tail(cache);
+ mutex_unlock(&cache->key_tail_lock);
+
+ pcache_dev_debug(pcache, "gc advance kset seg: %u\n", cur_seg->cache_seg_id);
+
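+ /* The old kset segment has been fully GC-ed; clear its bit so it can be reused. */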
+ spin_lock(&cache->seg_map_lock);
+ __clear_bit(cur_seg->cache_seg_id, cache->seg_map);
+ spin_unlock(&cache->seg_map_lock);
+}
+
+void pcache_cache_gc_fn(struct work_struct *work)
+{
+ struct pcache_cache *cache = container_of(work, struct pcache_cache, gc_work.work);
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_pos dirty_tail, key_tail;
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ struct pcache_cache_key_onmedia *key_onmedia;
+ struct pcache_cache_key *key;
+ int ret;
+ int i;
+
+ kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf;
+
+ while (true) {
+ if (pcache_is_stopping(pcache) || atomic_read(&cache->gc_errors))
+ return;
+
+ /* Get new tail positions */
+ mutex_lock(&cache->dirty_tail_lock);
+ cache_pos_copy(&dirty_tail, &cache->dirty_tail);
+ mutex_unlock(&cache->dirty_tail_lock);
+
+ mutex_lock(&cache->key_tail_lock);
+ cache_pos_copy(&key_tail, &cache->key_tail);
+ mutex_unlock(&cache->key_tail_lock);
+
+ if (!need_gc(cache, &dirty_tail, &key_tail))
+ break;
+
+ if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
+ /* Don't move to the next segment if dirty_tail has not moved */
+ if (dirty_tail.cache_seg == key_tail.cache_seg)
+ break;
+
+ last_kset_gc(cache, kset_onmedia);
+ continue;
+ }
+
+ for (i = 0; i < kset_onmedia->key_num; i++) {
+ struct pcache_cache_key key_tmp = { 0 };
+
+ key_onmedia = &kset_onmedia->data[i];
+
+ key = &key_tmp;
+ cache_key_init(&cache->req_key_tree, key);
+
+ ret = cache_key_decode(cache, key_onmedia, key);
+ if (ret) {
+ /* Return without re-arming the gc work, and prevent future
+ * gc, because we cannot retry a partially GC-ed kset.
+ */
+ atomic_inc(&cache->gc_errors);
+ pcache_dev_err(pcache, "failed to decode cache key in gc\n");
+ return;
+ }
+
+ cache_key_gc(cache, key);
+ }
+
+ pcache_dev_debug(pcache, "gc advance: %u:%u %u\n",
+ key_tail.cache_seg->cache_seg_id,
+ key_tail.seg_off,
+ get_kset_onmedia_size(kset_onmedia));
+
+ mutex_lock(&cache->key_tail_lock);
+ cache_pos_advance(&cache->key_tail, get_kset_onmedia_size(kset_onmedia));
+ cache_encode_key_tail(cache);
+ mutex_unlock(&cache->key_tail_lock);
+ }
+
+ queue_delayed_work(cache_get_wq(cache), &cache->gc_work, PCACHE_CACHE_GC_INTERVAL);
+}
diff --git a/drivers/md/dm-pcache/cache_key.c b/drivers/md/dm-pcache/cache_key.c
new file mode 100644
index 000000000000..2b77e121f89b
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_key.c
@@ -0,0 +1,888 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include "cache.h"
+#include "backing_dev.h"
+#include "cache_dev.h"
+#include "dm_pcache.h"
+
+struct pcache_cache_kset_onmedia pcache_empty_kset = { 0 };
+
+void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key)
+{
+ kref_init(&key->ref);
+ key->cache_tree = cache_tree;
+ INIT_LIST_HEAD(&key->list_node);
+ RB_CLEAR_NODE(&key->rb_node);
+}
+
+struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask)
+{
+ struct pcache_cache_key *key;
+
+ key = mempool_alloc(&cache_tree->key_pool, gfp_mask);
+ if (!key)
+ return NULL;
+
+ memset(key, 0, sizeof(struct pcache_cache_key));
+ cache_key_init(cache_tree, key);
+
+ return key;
+}
+
+/**
+ * cache_key_get - Increment the reference count of a cache key.
+ * @key: Pointer to the pcache_cache_key structure.
+ *
+ * This function increments the reference count of the specified cache key,
+ * ensuring that it is not freed while still in use.
+ */
+void cache_key_get(struct pcache_cache_key *key)
+{
+ kref_get(&key->ref);
+}
+
+/**
+ * cache_key_destroy - Free a cache key structure when its reference count drops to zero.
+ * @ref: Pointer to the kref structure.
+ *
+ * This function is called when the reference count of the cache key reaches zero.
+ * It returns the cache key to the cache tree's key mempool.
+ */
+static void cache_key_destroy(struct kref *ref)
+{
+ struct pcache_cache_key *key = container_of(ref, struct pcache_cache_key, ref);
+ struct pcache_cache_tree *cache_tree = key->cache_tree;
+
+ mempool_free(key, &cache_tree->key_pool);
+}
+
+void cache_key_put(struct pcache_cache_key *key)
+{
+ kref_put(&key->ref, cache_key_destroy);
+}
+
+void cache_pos_advance(struct pcache_cache_pos *pos, u32 len)
+{
+ /* Ensure enough space remains in the current segment */
+ BUG_ON(cache_seg_remain(pos) < len);
+
+ pos->seg_off += len;
+}
+
+static void cache_key_encode(struct pcache_cache *cache,
+ struct pcache_cache_key_onmedia *key_onmedia,
+ struct pcache_cache_key *key)
+{
+ key_onmedia->off = key->off;
+ key_onmedia->len = key->len;
+
+ key_onmedia->cache_seg_id = key->cache_pos.cache_seg->cache_seg_id;
+ key_onmedia->cache_seg_off = key->cache_pos.seg_off;
+
+ key_onmedia->seg_gen = key->seg_gen;
+ key_onmedia->flags = key->flags;
+
+ if (cache_data_crc_on(cache))
+ key_onmedia->data_crc = cache_key_data_crc(key);
+}
+
+int cache_key_decode(struct pcache_cache *cache,
+ struct pcache_cache_key_onmedia *key_onmedia,
+ struct pcache_cache_key *key)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+
+ key->off = key_onmedia->off;
+ key->len = key_onmedia->len;
+
+ key->cache_pos.cache_seg = &cache->segments[key_onmedia->cache_seg_id];
+ key->cache_pos.seg_off = key_onmedia->cache_seg_off;
+
+ key->seg_gen = key_onmedia->seg_gen;
+ key->flags = key_onmedia->flags;
+
+ if (cache_data_crc_on(cache) &&
+ key_onmedia->data_crc != cache_key_data_crc(key)) {
+ pcache_dev_err(pcache, "key: %llu:%u seg %u:%u data_crc error: %x, expected: %x\n",
+ key->off, key->len, key->cache_pos.cache_seg->cache_seg_id,
+ key->cache_pos.seg_off, cache_key_data_crc(key), key_onmedia->data_crc);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void append_last_kset(struct pcache_cache *cache, u32 next_seg)
+{
+ struct pcache_cache_kset_onmedia kset_onmedia = { 0 };
+
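+ /* A "last kset" marker tells readers (replay and gc) to jump to next_cache_seg_id for the following kset. */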
+ kset_onmedia.flags |= PCACHE_KSET_FLAGS_LAST;
+ kset_onmedia.next_cache_seg_id = next_seg;
+ kset_onmedia.magic = PCACHE_KSET_MAGIC;
+ kset_onmedia.crc = cache_kset_crc(&kset_onmedia);
+
+ memcpy_flushcache(get_key_head_addr(cache), &kset_onmedia, sizeof(struct pcache_cache_kset_onmedia));
+ pmem_wmb();
+ cache_pos_advance(&cache->key_head, sizeof(struct pcache_cache_kset_onmedia));
+}
+
+int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset)
+{
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ u32 kset_onmedia_size;
+ int ret;
+
+ kset_onmedia = &kset->kset_onmedia;
+
+ if (!kset_onmedia->key_num)
+ return 0;
+
+ kset_onmedia_size = struct_size(kset_onmedia, data, kset_onmedia->key_num);
+
+ spin_lock(&cache->key_head_lock);
+again:
+ /* Reserve space for the last kset */
+ if (cache_seg_remain(&cache->key_head) < kset_onmedia_size + sizeof(struct pcache_cache_kset_onmedia)) {
+ struct pcache_cache_segment *next_seg;
+
+ next_seg = get_cache_segment(cache);
+ if (!next_seg) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* clear outdated kset in next seg */
+ memcpy_flushcache(next_seg->segment.data, &pcache_empty_kset,
+ sizeof(struct pcache_cache_kset_onmedia));
+ append_last_kset(cache, next_seg->cache_seg_id);
+ cache->key_head.cache_seg = next_seg;
+ cache->key_head.seg_off = 0;
+ goto again;
+ }
+
+ kset_onmedia->magic = PCACHE_KSET_MAGIC;
+ kset_onmedia->crc = cache_kset_crc(kset_onmedia);
+
+ /* clear outdated kset after current kset */
+ memcpy_flushcache(get_key_head_addr(cache) + kset_onmedia_size, &pcache_empty_kset,
+ sizeof(struct pcache_cache_kset_onmedia));
+ /* write current kset into segment */
+ memcpy_flushcache(get_key_head_addr(cache), kset_onmedia, kset_onmedia_size);
+ pmem_wmb();
+
+ /* reset kset_onmedia */
+ memset(kset_onmedia, 0, sizeof(struct pcache_cache_kset_onmedia));
+ cache_pos_advance(&cache->key_head, kset_onmedia_size);
+
+ ret = 0;
+out:
+ spin_unlock(&cache->key_head_lock);
+
+ return ret;
+}
+
+/**
+ * cache_key_append - Append a cache key to the related kset.
+ * @cache: Pointer to the pcache_cache structure.
+ * @key: Pointer to the cache key structure to append.
+ * @force_close: Close the current kset even if it is not yet full.
+ *
+ * This function appends a cache key to the appropriate kset. If the kset
+ * is full, it closes the kset. If not, it queues a flush work to write
+ * the kset to media.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close)
+{
+ struct pcache_cache_kset *kset;
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ struct pcache_cache_key_onmedia *key_onmedia;
+ u32 kset_id = get_kset_id(cache, key->off);
+ int ret = 0;
+
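+ /* The target kset is selected from the key's logical offset (see get_kset_id()). */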
+ kset = get_kset(cache, kset_id);
+ kset_onmedia = &kset->kset_onmedia;
+
+ spin_lock(&kset->kset_lock);
+ key_onmedia = &kset_onmedia->data[kset_onmedia->key_num];
+ cache_key_encode(cache, key_onmedia, key);
+
+ /* Check if the current kset has reached the maximum number of keys */
+ if (++kset_onmedia->key_num == PCACHE_KSET_KEYS_MAX || force_close) {
+ /* If full, close the kset */
+ ret = cache_kset_close(cache, kset);
+ if (ret) {
+ kset_onmedia->key_num--;
+ goto out;
+ }
+ } else {
+ /* If not full, queue a delayed work to flush the kset */
+ queue_delayed_work(cache_get_wq(cache), &kset->flush_work, 1 * HZ);
+ }
+out:
+ spin_unlock(&kset->kset_lock);
+
+ return ret;
+}
+
+/**
+ * cache_subtree_walk - Traverse the cache tree.
+ * @ctx: Pointer to the context structure for traversal.
+ *
+ * This function traverses the cache tree starting from the specified node.
+ * It calls the appropriate callback functions based on the relationships
+ * between the keys in the cache tree.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache_key *key_tmp, *key;
+ struct rb_node *node_tmp;
+ int ret = SUBTREE_WALK_RET_OK;
+
+ key = ctx->key;
+ node_tmp = ctx->start_node;
+
+ while (node_tmp) {
+ if (ctx->walk_done && ctx->walk_done(ctx))
+ break;
+
+ key_tmp = CACHE_KEY(node_tmp);
+ /*
+ * If key_tmp ends before the start of key, continue to the next node.
+ * |----------|
+ * |=====|
+ */
+ if (cache_key_lend(key_tmp) <= cache_key_lstart(key)) {
+ if (ctx->after) {
+ ret = ctx->after(key, key_tmp, ctx);
+ if (ret)
+ goto out;
+ }
+ goto next;
+ }
+
+ /*
+ * If key_tmp starts after the end of key, stop traversing.
+ * |--------|
+ * |====|
+ */
+ if (cache_key_lstart(key_tmp) >= cache_key_lend(key)) {
+ if (ctx->before) {
+ ret = ctx->before(key, key_tmp, ctx);
+ if (ret)
+ goto out;
+ }
+ break;
+ }
+
+ /* Handle overlapping keys */
+ if (cache_key_lstart(key_tmp) >= cache_key_lstart(key)) {
+ /*
+ * If key_tmp encompasses key.
+ * |----------------| key_tmp
+ * |===========| key
+ */
+ if (cache_key_lend(key_tmp) >= cache_key_lend(key)) {
+ if (ctx->overlap_tail) {
+ ret = ctx->overlap_tail(key, key_tmp, ctx);
+ if (ret)
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * If key_tmp is contained within key.
+ * |----| key_tmp
+ * |==========| key
+ */
+ if (ctx->overlap_contain) {
+ ret = ctx->overlap_contain(key, key_tmp, ctx);
+ if (ret)
+ goto out;
+ }
+
+ goto next;
+ }
+
+ /*
+ * If key_tmp starts before key and extends beyond key's end.
+ * |-----------| key_tmp
+ * |====| key
+ */
+ if (cache_key_lend(key_tmp) > cache_key_lend(key)) {
+ if (ctx->overlap_contained) {
+ ret = ctx->overlap_contained(key, key_tmp, ctx);
+ if (ret)
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * If key_tmp starts before key and ends within key.
+ * |--------| key_tmp
+ * |==========| key
+ */
+ if (ctx->overlap_head) {
+ ret = ctx->overlap_head(key, key_tmp, ctx);
+ if (ret)
+ goto out;
+ }
+next:
+ node_tmp = rb_next(node_tmp);
+ }
+
+out:
+ if (ctx->walk_finally)
+ ret = ctx->walk_finally(ctx, ret);
+
+ return ret;
+}
+
+/**
+ * cache_subtree_search - Search for a key in the cache tree.
+ * @cache_subtree: Pointer to the cache tree structure.
+ * @key: Pointer to the cache key to search for.
+ * @parentp: Pointer to store the parent node of the found node.
+ * @newp: Pointer to store the location where the new node should be inserted.
+ * @delete_key_list: List to collect invalid keys for deletion.
+ *
+ * This function searches the cache tree for a specific key and returns
+ * the node that is the predecessor of the key, or the first node if the key is
+ * less than all keys in the tree. If any invalid keys are found during
+ * the search, they are added to the delete_key_list for later cleanup.
+ *
+ * Returns a pointer to the previous node.
+ */
+struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key,
+ struct rb_node **parentp, struct rb_node ***newp,
+ struct list_head *delete_key_list)
+{
+ struct rb_node **new, *parent = NULL;
+ struct pcache_cache_key *key_tmp;
+ struct rb_node *prev_node = NULL;
+
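+ /* Standard rbtree descent: remember the last node we branched right from, which is the closest key starting before key->off. */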
+ new = &(cache_subtree->root.rb_node);
+ while (*new) {
+ key_tmp = container_of(*new, struct pcache_cache_key, rb_node);
+ if (cache_key_invalid(key_tmp))
+ list_add(&key_tmp->list_node, delete_key_list);
+
+ parent = *new;
+ if (key_tmp->off >= key->off) {
+ new = &((*new)->rb_left);
+ } else {
+ prev_node = *new;
+ new = &((*new)->rb_right);
+ }
+ }
+
+ if (!prev_node)
+ prev_node = rb_first(&cache_subtree->root);
+
+ if (parentp)
+ *parentp = parent;
+
+ if (newp)
+ *newp = new;
+
+ return prev_node;
+}
+
+static struct pcache_cache_key *get_pre_alloc_key(struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache_key *key;
+
+ if (ctx->pre_alloc_key) {
+ key = ctx->pre_alloc_key;
+ ctx->pre_alloc_key = NULL;
+
+ return key;
+ }
+
+ return cache_key_alloc(ctx->cache_tree, GFP_NOWAIT);
+}
+
+/**
+ * fixup_overlap_tail - Adjust the key when it overlaps at the tail.
+ * @key: Pointer to the new cache key being inserted.
+ * @key_tmp: Pointer to the existing key that overlaps.
+ * @ctx: Pointer to the context for walking the cache tree.
+ *
+ * This function modifies the existing key (key_tmp) when there is an
+ * overlap at the tail with the new key. If the modified key becomes
+ * empty, it is deleted.
+ */
+static int fixup_overlap_tail(struct pcache_cache_key *key,
+ struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ /*
+ * |----------------| key_tmp
+ * |===========| key
+ */
+ BUG_ON(cache_key_empty(key));
+ if (cache_key_empty(key_tmp)) {
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ cache_key_cutfront(key_tmp, cache_key_lend(key) - cache_key_lstart(key_tmp));
+ if (key_tmp->len == 0) {
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/**
+ * fixup_overlap_contain - Handle case where new key completely contains an existing key.
+ * @key: Pointer to the new cache key being inserted.
+ * @key_tmp: Pointer to the existing key that is being contained.
+ * @ctx: Pointer to the context for walking the cache tree.
+ *
+ * This function deletes the existing key (key_tmp) when the new key
+ * completely contains it. It returns SUBTREE_WALK_RET_RESEARCH to indicate that the
+ * tree structure may have changed, necessitating a re-insertion of
+ * the new key.
+ */
+static int fixup_overlap_contain(struct pcache_cache_key *key,
+ struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ /*
+ * |----| key_tmp
+ * |==========| key
+ */
+ BUG_ON(cache_key_empty(key));
+ cache_key_delete(key_tmp);
+
+ return SUBTREE_WALK_RET_RESEARCH;
+}
+
+/**
+ * fixup_overlap_contained - Handle overlap when a new key is contained in an existing key.
+ * @key: The new cache key being inserted.
+ * @key_tmp: The existing cache key that overlaps with the new key.
+ * @ctx: Context for the cache tree walk.
+ *
+ * This function adjusts the existing key if the new key is contained
+ * within it. If the existing key is empty, it indicates a placeholder key
+ * that was inserted during a miss read. This placeholder will later be
+ * updated with real data from the backing_dev, making it no longer an empty key.
+ *
+ * If we delete a key or insert a new key, the structure of the cache tree may change,
+ * requiring a re-search of the tree to find a new insertion point.
+ */
+static int fixup_overlap_contained(struct pcache_cache_key *key,
+ struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache_tree *cache_tree = ctx->cache_tree;
+
+ /*
+ * |-----------| key_tmp
+ * |====| key
+ */
+ BUG_ON(cache_key_empty(key));
+ if (cache_key_empty(key_tmp)) {
+ /* If key_tmp is empty, don't split it;
+ * it's a placeholder key for miss reads that will be updated later.
+ */
+ cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
+ if (key_tmp->len == 0) {
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+ } else {
+ struct pcache_cache_key *key_fixup;
+ bool need_research = false;
+
+ key_fixup = get_pre_alloc_key(ctx);
+ if (!key_fixup)
+ return SUBTREE_WALK_RET_NEED_KEY;
+
+ cache_key_copy(key_fixup, key_tmp);
+
+ /* Split key_tmp based on the new key's range */
+ cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
+ if (key_tmp->len == 0) {
+ cache_key_delete(key_tmp);
+ need_research = true;
+ }
+
+ /* Create a new portion for key_fixup */
+ cache_key_cutfront(key_fixup, cache_key_lend(key) - cache_key_lstart(key_tmp));
+ if (key_fixup->len == 0) {
+ cache_key_put(key_fixup);
+ } else {
+ /* Insert the new key into the cache */
+ cache_key_insert(cache_tree, key_fixup, false);
+ need_research = true;
+ }
+
+ if (need_research)
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/**
+ * fixup_overlap_head - Handle overlap when an existing key overlaps the head of the new key.
+ * @key: The new cache key being inserted.
+ * @key_tmp: The existing cache key that overlaps with the new key.
+ * @ctx: Context for the cache tree walk.
+ *
+ * This function trims the tail of the existing key where it overlaps
+ * the head of the new key. If the resulting key length is zero
+ * after the adjustment, the key is deleted. This indicates that
+ * the key no longer holds valid data and requires the tree to be
+ * re-searched for a new insertion point.
+ */
+static int fixup_overlap_head(struct pcache_cache_key *key,
+ struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ /*
+ * |--------| key_tmp
+ * |==========| key
+ */
+ BUG_ON(cache_key_empty(key));
+ /* Adjust key_tmp by cutting back based on the new key's start */
+ cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
+ if (key_tmp->len == 0) {
+ /* If the adjusted key_tmp length is zero, delete it */
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/**
+ * cache_key_insert - Insert a new cache key into the cache tree.
+ * @cache_tree: Pointer to the cache_tree structure.
+ * @key: The cache key to insert.
+ * @fixup: True for a newly inserted key, so overlaps with existing keys are fixed up.
+ *
+ * This function searches for the appropriate location to insert
+ * a new cache key into the cache tree. It handles key overlaps
+ * and ensures any invalid keys are removed before insertion.
+ */
+void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup)
+{
+ struct pcache_cache *cache = cache_tree->cache;
+ struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 };
+ struct rb_node **new, *parent = NULL;
+ struct pcache_cache_subtree *cache_subtree;
+ struct pcache_cache_key *key_tmp = NULL, *key_next;
+ struct rb_node *prev_node = NULL;
+ LIST_HEAD(delete_key_list);
+ int ret;
+
+ cache_subtree = get_subtree(cache_tree, key->off);
+ key->cache_subtree = cache_subtree;
+search:
+ prev_node = cache_subtree_search(cache_subtree, key, &parent, &new, &delete_key_list);
+ if (!list_empty(&delete_key_list)) {
+ /* Remove invalid keys from the delete list */
+ list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) {
+ list_del_init(&key_tmp->list_node);
+ cache_key_delete(key_tmp);
+ }
+ goto search;
+ }
+
+ if (fixup) {
+ /* Set up the context with the cache, start node, and new key */
+ walk_ctx.cache_tree = cache_tree;
+ walk_ctx.start_node = prev_node;
+ walk_ctx.key = key;
+
+ /* Assign overlap handling functions for different scenarios */
+ walk_ctx.overlap_tail = fixup_overlap_tail;
+ walk_ctx.overlap_head = fixup_overlap_head;
+ walk_ctx.overlap_contain = fixup_overlap_contain;
+ walk_ctx.overlap_contained = fixup_overlap_contained;
+
+ ret = cache_subtree_walk(&walk_ctx);
+ switch (ret) {
+ case SUBTREE_WALK_RET_OK:
+ break;
+ case SUBTREE_WALK_RET_RESEARCH:
+ goto search;
+ case SUBTREE_WALK_RET_NEED_KEY:
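+ /* The walk could not allocate a key with GFP_NOWAIT: drop the lock, allocate with GFP_NOIO, and retry the search. */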
+ spin_unlock(&cache_subtree->tree_lock);
+ pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_key with GFP_NOIO");
+ walk_ctx.pre_alloc_key = cache_key_alloc(cache_tree, GFP_NOIO);
+ spin_lock(&cache_subtree->tree_lock);
+ goto search;
+ default:
+ BUG();
+ }
+ }
+
+ if (walk_ctx.pre_alloc_key)
+ cache_key_put(walk_ctx.pre_alloc_key);
+
+ /* Link and insert the new key into the red-black tree */
+ rb_link_node(&key->rb_node, parent, new);
+ rb_insert_color(&key->rb_node, &cache_subtree->root);
+}
+
+/**
+ * clean_fn - Cleanup function to remove invalid keys from the cache tree.
+ * @work: Pointer to the work_struct associated with the cleanup.
+ *
+ * This function cleans up invalid keys from the cache tree in the background
+ * after a cache segment has been invalidated during cache garbage collection.
+ * It processes a maximum of PCACHE_CLEAN_KEYS_MAX keys per iteration and holds
+ * the tree lock to ensure thread safety.
+ */
+void clean_fn(struct work_struct *work)
+{
+ struct pcache_cache *cache = container_of(work, struct pcache_cache, clean_work);
+ struct pcache_cache_subtree *cache_subtree;
+ struct rb_node *node;
+ struct pcache_cache_key *key;
+ int i, count;
+
+ for (i = 0; i < cache->req_key_tree.n_subtrees; i++) {
+ cache_subtree = &cache->req_key_tree.subtrees[i];
+
+again:
+ if (pcache_is_stopping(CACHE_TO_PCACHE(cache)))
+ return;
+
+ /* Delete up to PCACHE_CLEAN_KEYS_MAX keys in one iteration */
+ count = 0;
+ spin_lock(&cache_subtree->tree_lock);
+ node = rb_first(&cache_subtree->root);
+ while (node) {
+ key = CACHE_KEY(node);
+ node = rb_next(node);
+ if (cache_key_invalid(key)) {
+ count++;
+ cache_key_delete(key);
+ }
+
+ if (count >= PCACHE_CLEAN_KEYS_MAX) {
+ /* Unlock and pause before continuing cleanup */
+ spin_unlock(&cache_subtree->tree_lock);
+ usleep_range(1000, 2000);
+ goto again;
+ }
+ }
+ spin_unlock(&cache_subtree->tree_lock);
+ }
+}
+
+/*
+ * kset_flush_fn - Flush work for a cache kset.
+ *
+ * This work is queued from cache_key_append() when a kset is not yet
+ * full; when it runs, it closes the kset so that the appended keys are
+ * persisted.
+ *
+ * If cache_kset_close detects that a new segment is required to store
+ * the kset and there are no available segments, it will return an error.
+ * In this scenario, a retry will be attempted.
+ */
+void kset_flush_fn(struct work_struct *work)
+{
+ struct pcache_cache_kset *kset = container_of(work, struct pcache_cache_kset, flush_work.work);
+ struct pcache_cache *cache = kset->cache;
+ int ret;
+
+ if (pcache_is_stopping(CACHE_TO_PCACHE(cache)))
+ return;
+
+ spin_lock(&kset->kset_lock);
+ ret = cache_kset_close(cache, kset);
+ spin_unlock(&kset->kset_lock);
+
+ if (ret) {
+ /* Failed to flush kset, schedule a retry. */
+ queue_delayed_work(cache_get_wq(cache), &kset->flush_work, msecs_to_jiffies(100));
+ }
+}
+
+static int kset_replay(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
+{
+ struct pcache_cache_key_onmedia *key_onmedia;
+ struct pcache_cache_subtree *cache_subtree;
+ struct pcache_cache_key *key;
+ int ret;
+ int i;
+
+ for (i = 0; i < kset_onmedia->key_num; i++) {
+ key_onmedia = &kset_onmedia->data[i];
+
+ key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO);
+ ret = cache_key_decode(cache, key_onmedia, key);
+ if (ret) {
+ cache_key_put(key);
+ goto err;
+ }
+
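+ /* Mark the data segment referenced by this key as in use in the in-memory seg_map. */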
+ __set_bit(key->cache_pos.cache_seg->cache_seg_id, cache->seg_map);
+
+ /* Check if the segment generation is valid for insertion. */
+ if (key->seg_gen < key->cache_pos.cache_seg->gen) {
+ cache_key_put(key);
+ } else {
+ cache_subtree = get_subtree(&cache->req_key_tree, key->off);
+ spin_lock(&cache_subtree->tree_lock);
+ cache_key_insert(&cache->req_key_tree, key, true);
+ spin_unlock(&cache_subtree->tree_lock);
+ }
+
+ cache_seg_get(key->cache_pos.cache_seg);
+ }
+
+ return 0;
+err:
+ return ret;
+}
+
+int cache_replay(struct pcache_cache *cache)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_pos pos_tail;
+ struct pcache_cache_pos *pos;
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ u32 to_copy, count = 0;
+ int ret = 0;
+
+ kset_onmedia = kzalloc(PCACHE_KSET_ONMEDIA_SIZE_MAX, GFP_KERNEL);
+ if (!kset_onmedia)
+ return -ENOMEM;
+
+ cache_pos_copy(&pos_tail, &cache->key_tail);
+ pos = &pos_tail;
+
+ /*
+ * During cache replay nothing else accesses cache->seg_map,
+ * so we can set bits here without taking cache->seg_map_lock.
+ */
+ __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map);
+
+ while (true) {
+ to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - pos->seg_off);
+ ret = copy_mc_to_kernel(kset_onmedia, cache_pos_addr(pos), to_copy);
+ if (ret) {
+ ret = -EIO;
+ goto out;
+ }
+
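+ /* Stop at the first kset that fails magic/crc validation: it marks the end of the valid key log. */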
+ if (kset_onmedia->magic != PCACHE_KSET_MAGIC ||
+ kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
+ break;
+ }
+
+ /* Process the last kset and prepare for the next segment. */
+ if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
+ struct pcache_cache_segment *next_seg;
+
+ pcache_dev_debug(pcache, "last kset replay, next: %u\n", kset_onmedia->next_cache_seg_id);
+
+ next_seg = &cache->segments[kset_onmedia->next_cache_seg_id];
+
+ pos->cache_seg = next_seg;
+ pos->seg_off = 0;
+
+ __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map);
+ continue;
+ }
+
+ /* Replay the kset and check for errors. */
+ ret = kset_replay(cache, kset_onmedia);
+ if (ret)
+ goto out;
+
+ /* Advance the position after processing the kset. */
+ cache_pos_advance(pos, get_kset_onmedia_size(kset_onmedia));
+ if (++count > 512) {
+ cond_resched();
+ count = 0;
+ }
+ }
+
+ /* Update the key_head position after replaying. */
+ spin_lock(&cache->key_head_lock);
+ cache_pos_copy(&cache->key_head, pos);
+ spin_unlock(&cache->key_head_lock);
+out:
+ kfree(kset_onmedia);
+ return ret;
+}
+
+int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees)
+{
+ int ret;
+ u32 i;
+
+ cache_tree->cache = cache;
+ cache_tree->n_subtrees = n_subtrees;
+
+ ret = mempool_init_slab_pool(&cache_tree->key_pool, 1024, key_cache);
+ if (ret)
+ goto err;
+
+ /*
+ * Allocate and initialize the subtrees array.
+ * Each element is a cache tree structure that contains
+ * an RB tree root and a spinlock for protecting its contents.
+ */
+ cache_tree->subtrees = kvcalloc(cache_tree->n_subtrees, sizeof(struct pcache_cache_subtree), GFP_KERNEL);
+ if (!cache_tree->subtrees) {
+ ret = -ENOMEM;
+ goto key_pool_exit;
+ }
+
+ for (i = 0; i < cache_tree->n_subtrees; i++) {
+ struct pcache_cache_subtree *cache_subtree = &cache_tree->subtrees[i];
+
+ cache_subtree->root = RB_ROOT;
+ spin_lock_init(&cache_subtree->tree_lock);
+ }
+
+ return 0;
+
+key_pool_exit:
+ mempool_exit(&cache_tree->key_pool);
+err:
+ return ret;
+}
+
+void cache_tree_clear(struct pcache_cache_tree *cache_tree)
+{
+ struct pcache_cache_subtree *cache_subtree;
+ struct rb_node *node;
+ struct pcache_cache_key *key;
+ u32 i;
+
+ for (i = 0; i < cache_tree->n_subtrees; i++) {
+ cache_subtree = &cache_tree->subtrees[i];
+
+ spin_lock(&cache_subtree->tree_lock);
+ node = rb_first(&cache_subtree->root);
+ while (node) {
+ key = CACHE_KEY(node);
+ node = rb_next(node);
+
+ cache_key_delete(key);
+ }
+ spin_unlock(&cache_subtree->tree_lock);
+ }
+}
+
+void cache_tree_exit(struct pcache_cache_tree *cache_tree)
+{
+ cache_tree_clear(cache_tree);
+ kvfree(cache_tree->subtrees);
+ mempool_exit(&cache_tree->key_pool);
+}
diff --git a/drivers/md/dm-pcache/cache_req.c b/drivers/md/dm-pcache/cache_req.c
new file mode 100644
index 000000000000..7854a30e07b7
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_req.c
@@ -0,0 +1,836 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "cache.h"
+#include "backing_dev.h"
+#include "cache_dev.h"
+#include "dm_pcache.h"
+
+static int cache_data_head_init(struct pcache_cache *cache)
+{
+ struct pcache_cache_segment *next_seg;
+ struct pcache_cache_data_head *data_head;
+
+ data_head = get_data_head(cache);
+ next_seg = get_cache_segment(cache);
+ if (!next_seg)
+ return -EBUSY;
+
+ cache_seg_get(next_seg);
+ data_head->head_pos.cache_seg = next_seg;
+ data_head->head_pos.seg_off = 0;
+
+ return 0;
+}
+
+/**
+ * cache_data_alloc - Allocate data for a cache key.
+ * @cache: Pointer to the cache structure.
+ * @key: Pointer to the cache key to allocate data for.
+ *
+ * This function tries to allocate space from the cache segment specified by the
+ * data head. If the remaining space in the segment is insufficient to allocate
+ * the requested length for the cache key, it will allocate whatever is available
+ * and adjust the key's length accordingly. This function does not allocate
+ * space that crosses segment boundaries.
+ */
+static int cache_data_alloc(struct pcache_cache *cache, struct pcache_cache_key *key)
+{
+ struct pcache_cache_data_head *data_head;
+ struct pcache_cache_pos *head_pos;
+ struct pcache_cache_segment *cache_seg;
+ u32 seg_remain;
+ u32 allocated = 0, to_alloc;
+ int ret = 0;
+
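+ /* Stay on this CPU while working on the data head returned by get_data_head(). */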
+ preempt_disable();
+ data_head = get_data_head(cache);
+again:
+ to_alloc = key->len - allocated;
+ if (!data_head->head_pos.cache_seg) {
+ seg_remain = 0;
+ } else {
+ cache_pos_copy(&key->cache_pos, &data_head->head_pos);
+ key->seg_gen = key->cache_pos.cache_seg->gen;
+
+ head_pos = &data_head->head_pos;
+ cache_seg = head_pos->cache_seg;
+ seg_remain = cache_seg_remain(head_pos);
+ }
+
+ if (seg_remain > to_alloc) {
+ /* If remaining space in segment is sufficient for the cache key, allocate it. */
+ cache_pos_advance(head_pos, to_alloc);
+ allocated += to_alloc;
+ cache_seg_get(cache_seg);
+ } else if (seg_remain) {
+ /* If remaining space is not enough, allocate the remaining space and adjust the cache key length. */
+ cache_pos_advance(head_pos, seg_remain);
+ key->len = seg_remain;
+
+ /* Get for key: obtain a reference to the cache segment for the key. */
+ cache_seg_get(cache_seg);
+ /* Put for head_pos->cache_seg: release the reference for the current head's segment. */
+ cache_seg_put(head_pos->cache_seg);
+ head_pos->cache_seg = NULL;
+ } else {
+ /* Initialize a new data head if no segment is available. */
+ ret = cache_data_head_init(cache);
+ if (ret)
+ goto out;
+
+ goto again;
+ }
+
+out:
+ preempt_enable();
+
+ return ret;
+}
+
+static int cache_copy_from_req_bio(struct pcache_cache *cache, struct pcache_cache_key *key,
+ struct pcache_request *pcache_req, u32 bio_off)
+{
+ struct pcache_cache_pos *pos = &key->cache_pos;
+ struct pcache_segment *segment;
+
+ segment = &pos->cache_seg->segment;
+
+ return segment_copy_from_bio(segment, pos->seg_off, key->len, pcache_req->bio, bio_off);
+}
+
+static int cache_copy_to_req_bio(struct pcache_cache *cache, struct pcache_request *pcache_req,
+ u32 bio_off, u32 len, struct pcache_cache_pos *pos, u64 key_gen)
+{
+ struct pcache_cache_segment *cache_seg = pos->cache_seg;
+ struct pcache_segment *segment = &cache_seg->segment;
+ int ret;
+
+ spin_lock(&cache_seg->gen_lock);
+ if (key_gen < cache_seg->gen) {
+ spin_unlock(&cache_seg->gen_lock);
+ return -EINVAL;
+ }
+
+ ret = segment_copy_to_bio(segment, pos->seg_off, len, pcache_req->bio, bio_off);
+ spin_unlock(&cache_seg->gen_lock);
+
+ return ret;
+}
+
+/**
+ * miss_read_end_req - Handle the end of a miss read request.
+ * @backing_req: Pointer to the request structure.
+ * @read_ret: Return value of read.
+ *
+ * This function is called when a backing request to read data from
+ * the backing_dev is completed. If the key associated with the request
+ * is empty (a placeholder), it allocates cache space for the key,
+ * copies the data read from the bio into the cache, and updates
+ * the key's status. If the key has been overwritten by a write
+ * request during this process, it will be deleted from the cache
+ * tree and no further action will be taken.
+ */
+static void miss_read_end_req(struct pcache_backing_dev_req *backing_req, int read_ret)
+{
+ void *priv_data = backing_req->priv_data;
+ struct pcache_request *pcache_req = backing_req->req.upper_req;
+ struct pcache_cache *cache = backing_req->backing_dev->cache;
+ int ret;
+
+ if (priv_data) {
+ struct pcache_cache_key *key;
+ struct pcache_cache_subtree *cache_subtree;
+
+ key = (struct pcache_cache_key *)priv_data;
+ cache_subtree = key->cache_subtree;
+
+ /* If this key was deleted from the cache_subtree by a write, key->flags
+ * will have been cleared, so if cache_key_empty() returns true the key
+ * is still in the cache_subtree.
+ */
+ spin_lock(&cache_subtree->tree_lock);
+ if (cache_key_empty(key)) {
+ /* Check if the backing request was successful. */
+ if (read_ret) {
+ cache_key_delete(key);
+ goto unlock;
+ }
+
+ /* Allocate cache space for the key and copy data from the backing_dev. */
+ ret = cache_data_alloc(cache, key);
+ if (ret) {
+ cache_key_delete(key);
+ goto unlock;
+ }
+
+ ret = cache_copy_from_req_bio(cache, key, pcache_req, backing_req->req.bio_off);
+ if (ret) {
+ cache_seg_put(key->cache_pos.cache_seg);
+ cache_key_delete(key);
+ goto unlock;
+ }
+ key->flags &= ~PCACHE_CACHE_KEY_FLAGS_EMPTY;
+ key->flags |= PCACHE_CACHE_KEY_FLAGS_CLEAN;
+
+ /* Append the key to the cache. */
+ ret = cache_key_append(cache, key, false);
+ if (ret) {
+ cache_seg_put(key->cache_pos.cache_seg);
+ cache_key_delete(key);
+ goto unlock;
+ }
+ }
+unlock:
+ spin_unlock(&cache_subtree->tree_lock);
+ cache_key_put(key);
+ }
+}
+
+/**
+ * submit_cache_miss_req - Submit a backing request when cache data is missing
+ * @cache: The cache context that manages cache operations
+ * @backing_req: The cache request containing information about the read request
+ *
+ * This function is used to handle cases where a cache read request cannot locate
+ * the required data in the cache. When such a miss occurs during `cache_subtree_walk`,
+ * it triggers a backing read request to fetch data from the backing storage.
+ *
+ * If `backing_req->priv_data` is set, it points to a `pcache_cache_key` representing
+ * a placeholder key for the missing range. The function calls `cache_key_insert` to
+ * add that key to the cache tree before the backing request is submitted, so that the
+ * data read back from the backing device can later be filled into the cache.
+ */
+static void submit_cache_miss_req(struct pcache_cache *cache, struct pcache_backing_dev_req *backing_req)
+{
+ if (backing_req->priv_data) {
+ struct pcache_cache_key *key;
+
+ /* Attempt to insert the key into the cache if priv_data is set */
+ key = (struct pcache_cache_key *)backing_req->priv_data;
+ cache_key_insert(&cache->req_key_tree, key, true);
+ }
+ backing_dev_req_submit(backing_req, false);
+}
+
+static void cache_miss_req_free(struct pcache_backing_dev_req *backing_req)
+{
+ struct pcache_cache_key *key;
+
+ if (backing_req->priv_data) {
+ key = backing_req->priv_data;
+ backing_req->priv_data = NULL;
+ cache_key_put(key); /* for ->priv_data */
+ cache_key_put(key); /* for init ref in alloc */
+ }
+
+ backing_dev_req_end(backing_req);
+}
+
+static struct pcache_backing_dev_req *cache_miss_req_alloc(struct pcache_cache *cache,
+ struct pcache_request *parent,
+ gfp_t gfp_mask)
+{
+ struct pcache_backing_dev *backing_dev = cache->backing_dev;
+ struct pcache_backing_dev_req *backing_req;
+ struct pcache_cache_key *key = NULL;
+ struct pcache_backing_dev_req_opts req_opts = { 0 };
+
+ req_opts.type = BACKING_DEV_REQ_TYPE_REQ;
+ req_opts.gfp_mask = gfp_mask;
+ req_opts.req.upper_req = parent;
+
+ backing_req = backing_dev_req_alloc(backing_dev, &req_opts);
+ if (!backing_req)
+ return NULL;
+
+ key = cache_key_alloc(&cache->req_key_tree, gfp_mask);
+ if (!key)
+ goto free_backing_req;
+
+ cache_key_get(key);
+ backing_req->priv_data = key;
+
+ return backing_req;
+
+free_backing_req:
+ cache_miss_req_free(backing_req);
+ return NULL;
+}
+
+static void cache_miss_req_init(struct pcache_cache *cache,
+ struct pcache_backing_dev_req *backing_req,
+ struct pcache_request *parent,
+ u32 off, u32 len, bool insert_key)
+{
+ struct pcache_cache_key *key;
+ struct pcache_backing_dev_req_opts req_opts = { 0 };
+
+ req_opts.type = BACKING_DEV_REQ_TYPE_REQ;
+ req_opts.req.upper_req = parent;
+ req_opts.req.req_off = off;
+ req_opts.req.len = len;
+ req_opts.end_fn = miss_read_end_req;
+
+ backing_dev_req_init(backing_req, &req_opts);
+
+ if (insert_key) {
+ key = backing_req->priv_data;
+ key->off = parent->off + off;
+ key->len = len;
+ key->flags |= PCACHE_CACHE_KEY_FLAGS_EMPTY;
+ } else {
+ key = backing_req->priv_data;
+ backing_req->priv_data = NULL;
+ cache_key_put(key);
+ cache_key_put(key);
+ }
+}
+
+static struct pcache_backing_dev_req *get_pre_alloc_req(struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache *cache = ctx->cache_tree->cache;
+ struct pcache_request *pcache_req = ctx->pcache_req;
+ struct pcache_backing_dev_req *backing_req;
+
+ if (ctx->pre_alloc_req) {
+ backing_req = ctx->pre_alloc_req;
+ ctx->pre_alloc_req = NULL;
+
+ return backing_req;
+ }
+
+ return cache_miss_req_alloc(cache, pcache_req, GFP_NOWAIT);
+}
+
+/*
+ * In the process of walking the cache tree to locate cached data, this
+ * function handles the situation where the requested data range lies
+ * entirely before an existing cache node (`key_tmp`). This outcome
+ * signifies that the target data is absent from the cache (cache miss).
+ *
+ * To fulfill this portion of the read request, the function creates a
+ * backing request (`backing_req`) for the missing data range represented
+ * by `key`. It then appends this request to the submission list in the
+ * `ctx`, which will later be processed to retrieve the data from backing
+ * storage. After setting up the backing request, `req_done` in `ctx` is
+ * updated to reflect the length of the handled range, and the range
+ * in `key` is adjusted by trimming off the portion that is now handled.
+ *
+ * The scenario handled here:
+ *
+ * |--------| key_tmp (existing cached range)
+ * |====| key (requested range, preceding key_tmp)
+ *
+ * Since `key` is before `key_tmp`, it signifies that the requested data
+ * range is missing in the cache (cache miss) and needs retrieval from
+ * backing storage.
+ */
+static int read_before(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_backing_dev_req *backing_req;
+ struct pcache_cache *cache = ctx->cache_tree->cache;
+
+ /*
+ * In this scenario, `key` represents a range that precedes `key_tmp`,
+ * meaning the requested data range is missing from the cache tree
+ * and must be retrieved from the backing_dev.
+ */
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true);
+
+ list_add(&backing_req->node, ctx->submit_req_list);
+ ctx->req_done += key->len;
+ cache_key_cutfront(key, key->len);
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/*
+ * During cache_subtree_walk, this function manages a scenario where part of the
+ * requested data range overlaps with an existing cache node (`key_tmp`).
+ *
+ * |----------------| key_tmp (existing cached range)
+ * |===========| key (requested range, overlapping the tail of key_tmp)
+ */
+static int read_overlap_tail(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache *cache = ctx->cache_tree->cache;
+ struct pcache_backing_dev_req *backing_req;
+ u32 io_len;
+ int ret;
+
+ /*
+ * Calculate the length of the non-overlapping portion of `key`
+ * before `key_tmp`, representing the data missing in the cache.
+ */
+ io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key);
+ if (io_len) {
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true);
+
+ list_add(&backing_req->node, ctx->submit_req_list);
+ ctx->req_done += io_len;
+ cache_key_cutfront(key, io_len);
+ }
+
+ /*
+ * Handle the overlapping portion by calculating the length of
+ * the remaining data in `key` that coincides with `key_tmp`.
+ */
+ io_len = cache_key_lend(key) - cache_key_lstart(key_tmp);
+ if (cache_key_empty(key_tmp)) {
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false);
+ submit_cache_miss_req(cache, backing_req);
+ } else {
+ ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
+ io_len, &key_tmp->cache_pos, key_tmp->seg_gen);
+ if (ret) {
+ if (ret == -EINVAL) {
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ ctx->ret = ret;
+ return SUBTREE_WALK_RET_ERR;
+ }
+ }
+
+ ctx->req_done += io_len;
+ cache_key_cutfront(key, io_len);
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/*
+ * |----| key_tmp (existing cached range)
+ * |==========| key (requested range)
+ */
+static int read_overlap_contain(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache *cache = ctx->cache_tree->cache;
+ struct pcache_backing_dev_req *backing_req;
+ u32 io_len;
+ int ret;
+
+ /*
+ * Calculate the non-overlapping part of `key` before `key_tmp`
+ * to identify the missing data length.
+ */
+ io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key);
+ if (io_len) {
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true);
+
+ list_add(&backing_req->node, ctx->submit_req_list);
+
+ ctx->req_done += io_len;
+ cache_key_cutfront(key, io_len);
+ }
+
+ /*
+ * Handle the overlapping portion between `key` and `key_tmp`.
+ */
+ io_len = key_tmp->len;
+ if (cache_key_empty(key_tmp)) {
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false);
+ submit_cache_miss_req(cache, backing_req);
+ } else {
+ ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
+ io_len, &key_tmp->cache_pos, key_tmp->seg_gen);
+ if (ret) {
+ if (ret == -EINVAL) {
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ ctx->ret = ret;
+ return SUBTREE_WALK_RET_ERR;
+ }
+ }
+
+ ctx->req_done += io_len;
+ cache_key_cutfront(key, io_len);
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/*
+ * |-----------| key_tmp (existing cached range)
+ * |====| key (requested range, fully within key_tmp)
+ *
+ * If `key_tmp` contains valid cached data, this function copies the relevant
+ * portion to the request's bio. Otherwise, it sends a backing request to
+ * fetch the required data range.
+ */
+static int read_overlap_contained(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache *cache = ctx->cache_tree->cache;
+ struct pcache_backing_dev_req *backing_req;
+ struct pcache_cache_pos pos;
+ int ret;
+
+ /*
+ * Check if `key_tmp` is empty, indicating a miss. If so, initiate
+ * a backing request to fetch the required data for `key`.
+ */
+ if (cache_key_empty(key_tmp)) {
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, false);
+ submit_cache_miss_req(cache, backing_req);
+ } else {
+ cache_pos_copy(&pos, &key_tmp->cache_pos);
+ cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp));
+
+ ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
+ key->len, &pos, key_tmp->seg_gen);
+ if (ret) {
+ if (ret == -EINVAL) {
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ ctx->ret = ret;
+ return SUBTREE_WALK_RET_ERR;
+ }
+ }
+
+ ctx->req_done += key->len;
+ cache_key_cutfront(key, key->len);
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/*
+ * |--------| key_tmp (existing cached range)
+ * |==========| key (requested range, overlapping the head of key_tmp)
+ */
+static int read_overlap_head(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
+ struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ struct pcache_cache *cache = ctx->cache_tree->cache;
+ struct pcache_backing_dev_req *backing_req;
+ struct pcache_cache_pos pos;
+ u32 io_len;
+ int ret;
+
+ io_len = cache_key_lend(key_tmp) - cache_key_lstart(key);
+
+ if (cache_key_empty(key_tmp)) {
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false);
+ submit_cache_miss_req(cache, backing_req);
+ } else {
+ cache_pos_copy(&pos, &key_tmp->cache_pos);
+ cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp));
+
+ ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
+ io_len, &pos, key_tmp->seg_gen);
+ if (ret) {
+ if (ret == -EINVAL) {
+ cache_key_delete(key_tmp);
+ return SUBTREE_WALK_RET_RESEARCH;
+ }
+
+ ctx->ret = ret;
+ return SUBTREE_WALK_RET_ERR;
+ }
+ }
+
+ ctx->req_done += io_len;
+ cache_key_cutfront(key, io_len);
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/**
+ * read_walk_finally - Finalizes the cache read tree walk by submitting any
+ * remaining backing requests
+ * @ctx: Context structure holding information about the cache,
+ * read request, and submission list
+ * @ret: Walk status produced by the traversal so far.
+ *
+ * This function is called at the end of the `cache_subtree_walk` during a
+ * cache read operation. It completes the walk by checking if any data
+ * requested by `key` was not found in the cache tree, and if so, it sends
+ * a backing request to retrieve that data. Then, it iterates through the
+ * submission list of backing requests created during the walk, removing
+ * each request from the list and submitting it.
+ *
+ * The scenario managed here includes:
+ * - Sending a backing request for the remaining length of `key` if it was
+ * not fulfilled by existing cache entries.
+ * - Iterating through `ctx->submit_req_list` to submit each backing request
+ * enqueued during the walk.
+ *
+ * This ensures all necessary backing requests for cache misses are submitted
+ * to the backing storage to retrieve any data that could not be found in
+ * the cache.
+ */
+static int read_walk_finally(struct pcache_cache_subtree_walk_ctx *ctx, int ret)
+{
+ struct pcache_cache *cache = ctx->cache_tree->cache;
+ struct pcache_backing_dev_req *backing_req, *next_req;
+ struct pcache_cache_key *key = ctx->key;
+
+ list_for_each_entry_safe(backing_req, next_req, ctx->submit_req_list, node) {
+ list_del_init(&backing_req->node);
+ submit_cache_miss_req(ctx->cache_tree->cache, backing_req);
+ }
+
+ if (ret != SUBTREE_WALK_RET_OK)
+ return ret;
+
+ if (key->len) {
+ backing_req = get_pre_alloc_req(ctx);
+ if (!backing_req)
+ return SUBTREE_WALK_RET_NEED_REQ;
+
+ cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true);
+ submit_cache_miss_req(cache, backing_req);
+ ctx->req_done += key->len;
+ }
+
+ return SUBTREE_WALK_RET_OK;
+}
+
+/*
+ * This function is used within `cache_subtree_walk` to determine whether the
+ * read operation has covered the requested data length. It compares the
+ * amount of data processed (`ctx->req_done`) with the total data length
+ * specified in the original request (`ctx->pcache_req->data_len`).
+ *
+ * If `req_done` meets or exceeds the required data length, the function
+ * returns `true`, indicating the walk is complete. Otherwise, it returns `false`,
+ * signaling that additional data processing is needed to fulfill the request.
+ */
+static bool read_walk_done(struct pcache_cache_subtree_walk_ctx *ctx)
+{
+ return (ctx->req_done >= ctx->pcache_req->data_len);
+}
+
+/**
+ * cache_read - Process a read request by traversing the cache tree
+ * @cache: Cache structure holding cache trees and related configurations
+ * @pcache_req: Request structure with information about the data to read
+ *
+ * This function attempts to fulfill a read request by traversing the cache tree(s)
+ * to locate cached data for the requested range. If parts of the data are missing
+ * in the cache, backing requests are generated to retrieve the required segments.
+ *
+ * The function operates by initializing a key for the requested data range and
+ * preparing a context (`walk_ctx`) to manage the cache tree traversal. The context
+ * includes pointers to functions (e.g., `read_before`, `read_overlap_tail`) that handle
+ * specific conditions encountered during the traversal. The `walk_finally` and `walk_done`
+ * functions manage the end stages of the traversal, while the `delete_key_list` and
+ * `submit_req_list` lists track any keys to be deleted or requests to be submitted.
+ *
+ * The function first calculates the requested range and clamps it to the
+ * boundary of the current cache subtree (based on the subtree size limit). It
+ * then locks that subtree and performs a search to locate any matching keys.
+ * If outdated keys are found, they are deleted and the search is restarted to
+ * ensure accurate data retrieval.
+ *
+ * If the requested range spans multiple cache subtrees, the function moves on
+ * to the next subtree once the current range has been processed. This
+ * continues until the entire requested data length has been handled.
+ */
+static int cache_read(struct pcache_cache *cache, struct pcache_request *pcache_req)
+{
+ struct pcache_cache_key key_data = { .off = pcache_req->off, .len = pcache_req->data_len };
+ struct pcache_cache_subtree *cache_subtree;
+ struct pcache_cache_key *key_tmp = NULL, *key_next;
+ struct rb_node *prev_node = NULL;
+ struct pcache_cache_key *key = &key_data;
+ struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 };
+ struct pcache_backing_dev_req *backing_req, *next_req;
+ LIST_HEAD(delete_key_list);
+ LIST_HEAD(submit_req_list);
+ int ret;
+
+ walk_ctx.cache_tree = &cache->req_key_tree;
+ walk_ctx.req_done = 0;
+ walk_ctx.pcache_req = pcache_req;
+ walk_ctx.before = read_before;
+ walk_ctx.overlap_tail = read_overlap_tail;
+ walk_ctx.overlap_head = read_overlap_head;
+ walk_ctx.overlap_contain = read_overlap_contain;
+ walk_ctx.overlap_contained = read_overlap_contained;
+ walk_ctx.walk_finally = read_walk_finally;
+ walk_ctx.walk_done = read_walk_done;
+ walk_ctx.delete_key_list = &delete_key_list;
+ walk_ctx.submit_req_list = &submit_req_list;
+
+next:
+ key->off = pcache_req->off + walk_ctx.req_done;
+ key->len = pcache_req->data_len - walk_ctx.req_done;
+ if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK))
+ key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK);
+
+ cache_subtree = get_subtree(&cache->req_key_tree, key->off);
+ spin_lock(&cache_subtree->tree_lock);
+search:
+ prev_node = cache_subtree_search(cache_subtree, key, NULL, NULL, &delete_key_list);
+ if (!list_empty(&delete_key_list)) {
+ list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) {
+ list_del_init(&key_tmp->list_node);
+ cache_key_delete(key_tmp);
+ }
+ goto search;
+ }
+
+ walk_ctx.start_node = prev_node;
+ walk_ctx.key = key;
+
+ ret = cache_subtree_walk(&walk_ctx);
+ if (ret == SUBTREE_WALK_RET_RESEARCH)
+ goto search;
+ spin_unlock(&cache_subtree->tree_lock);
+
+ if (ret == SUBTREE_WALK_RET_ERR) {
+ ret = walk_ctx.ret;
+ goto out;
+ }
+
+ if (ret == SUBTREE_WALK_RET_NEED_REQ) {
+ walk_ctx.pre_alloc_req = cache_miss_req_alloc(cache, pcache_req, GFP_NOIO);
+ pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_req with GFP_NOIO");
+ }
+
+ if (walk_ctx.req_done < pcache_req->data_len)
+ goto next;
+ ret = 0;
+out:
+ if (walk_ctx.pre_alloc_req)
+ cache_miss_req_free(walk_ctx.pre_alloc_req);
+
+ list_for_each_entry_safe(backing_req, next_req, &submit_req_list, node) {
+ list_del_init(&backing_req->node);
+ backing_dev_req_end(backing_req);
+ }
+
+ return ret;
+}
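Both cache_read() above and cache_write() below clamp each pass so that a single key never crosses a subtree boundary. A small user-space sketch of that clamping, assuming a 4 MiB subtree size purely for illustration (the real value comes from PCACHE_CACHE_SUBTREE_SIZE in cache.h):

#include <stdio.h>
#include <stdint.h>

/* Illustrative value only; the driver defines its own subtree size. */
#define SUBTREE_SIZE	(4ULL * 1024 * 1024)
#define SUBTREE_MASK	(SUBTREE_SIZE - 1)

int main(void)
{
	uint64_t off = 3ULL * 1024 * 1024 + 512 * 1024;	/* request starts at 3.5 MiB ... */
	uint32_t len = 2 * 1024 * 1024;			/* ... and asks for 2 MiB */
	uint32_t done = 0;

	/* Same clamping as in cache_read()/cache_write(): each pass stops at a subtree boundary. */
	while (done < len) {
		uint64_t cur_off = off + done;
		uint32_t cur_len = len - done;
		uint32_t to_boundary = SUBTREE_SIZE - (cur_off & SUBTREE_MASK);

		if (cur_len > to_boundary)
			cur_len = to_boundary;

		printf("pass: off=%llu len=%u\n", (unsigned long long)cur_off, cur_len);
		done += cur_len;
	}
	return 0;
}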
+
+static int cache_write(struct pcache_cache *cache, struct pcache_request *pcache_req)
+{
+ struct pcache_cache_subtree *cache_subtree;
+ struct pcache_cache_key *key;
+ u64 offset = pcache_req->off;
+ u32 length = pcache_req->data_len;
+ u32 io_done = 0;
+ int ret;
+
+ while (true) {
+ if (io_done >= length)
+ break;
+
+ key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO);
+ key->off = offset + io_done;
+ key->len = length - io_done;
+ if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK))
+ key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK);
+
+ ret = cache_data_alloc(cache, key);
+ if (ret) {
+ cache_key_put(key);
+ goto err;
+ }
+
+ ret = cache_copy_from_req_bio(cache, key, pcache_req, io_done);
+ if (ret) {
+ cache_seg_put(key->cache_pos.cache_seg);
+ cache_key_put(key);
+ goto err;
+ }
+
+ cache_subtree = get_subtree(&cache->req_key_tree, key->off);
+ spin_lock(&cache_subtree->tree_lock);
+ cache_key_insert(&cache->req_key_tree, key, true);
+ ret = cache_key_append(cache, key, pcache_req->bio->bi_opf & REQ_FUA);
+ if (ret) {
+ cache_seg_put(key->cache_pos.cache_seg);
+ cache_key_delete(key);
+ goto unlock;
+ }
+
+ io_done += key->len;
+ spin_unlock(&cache_subtree->tree_lock);
+ }
+
+ return 0;
+unlock:
+ spin_unlock(&cache_subtree->tree_lock);
+err:
+ return ret;
+}
+
+/**
+ * pcache_cache_flush - Flush all ksets to persist any pending cache data
+ * @cache: Pointer to the cache structure
+ *
+ * This function iterates through all ksets associated with the provided `cache`
+ * and ensures that any data marked for persistence is written to media. For each
+ * kset, it acquires the kset lock, then invokes `cache_kset_close`, which handles
+ * the persistence logic for that kset.
+ *
+ * If `cache_kset_close` encounters an error, the function exits immediately with
+ * the respective error code, preventing the flush operation from proceeding to
+ * subsequent ksets.
+ */
+int pcache_cache_flush(struct pcache_cache *cache)
+{
+ struct pcache_cache_kset *kset;
+ int ret;
+ u32 i;
+
+ for (i = 0; i < cache->n_ksets; i++) {
+ kset = get_kset(cache, i);
+
+ spin_lock(&kset->kset_lock);
+ ret = cache_kset_close(cache, kset);
+ spin_unlock(&kset->kset_lock);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req)
+{
+ struct bio *bio = pcache_req->bio;
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH))
+ return pcache_cache_flush(cache);
+
+ if (bio_data_dir(bio) == READ)
+ return cache_read(cache, pcache_req);
+
+ return cache_write(cache, pcache_req);
+}
diff --git a/drivers/md/dm-pcache/cache_segment.c b/drivers/md/dm-pcache/cache_segment.c
new file mode 100644
index 000000000000..f0b58980806e
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_segment.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "cache_dev.h"
+#include "cache.h"
+#include "backing_dev.h"
+#include "dm_pcache.h"
+
+static inline struct pcache_segment_info *get_seg_info_addr(struct pcache_cache_segment *cache_seg)
+{
+ struct pcache_segment_info *seg_info_addr;
+ u32 seg_id = cache_seg->segment.seg_id;
+ void *seg_addr;
+
+ seg_addr = CACHE_DEV_SEGMENT(cache_seg->cache->cache_dev, seg_id);
+ seg_info_addr = seg_addr + PCACHE_SEG_INFO_SIZE * cache_seg->info_index;
+
+ return seg_info_addr;
+}
+
+static void cache_seg_info_write(struct pcache_cache_segment *cache_seg)
+{
+ struct pcache_segment_info *seg_info_addr;
+ struct pcache_segment_info *seg_info = &cache_seg->cache_seg_info;
+
+ mutex_lock(&cache_seg->info_lock);
+ seg_info->header.seq++;
+ seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info));
+
+ seg_info_addr = get_seg_info_addr(cache_seg);
+ memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info));
+ pmem_wmb();
+
+ cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX;
+ mutex_unlock(&cache_seg->info_lock);
+}
+
+static int cache_seg_info_load(struct pcache_cache_segment *cache_seg)
+{
+ struct pcache_segment_info *cache_seg_info_addr_base, *cache_seg_info_addr;
+ struct pcache_cache_dev *cache_dev = cache_seg->cache->cache_dev;
+ struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
+ u32 seg_id = cache_seg->segment.seg_id;
+ int ret = 0;
+
+ cache_seg_info_addr_base = CACHE_DEV_SEGMENT(cache_dev, seg_id);
+
+ mutex_lock(&cache_seg->info_lock);
+ cache_seg_info_addr = pcache_meta_find_latest(&cache_seg_info_addr_base->header,
+ sizeof(struct pcache_segment_info),
+ PCACHE_SEG_INFO_SIZE,
+ &cache_seg->cache_seg_info);
+ if (IS_ERR(cache_seg_info_addr)) {
+ ret = PTR_ERR(cache_seg_info_addr);
+ goto out;
+ } else if (!cache_seg_info_addr) {
+ ret = -EIO;
+ goto out;
+ }
+ cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base;
+out:
+ mutex_unlock(&cache_seg->info_lock);
+
+ if (ret)
+ pcache_dev_err(pcache, "can't read segment info of segment: %u, ret: %d\n",
+ cache_seg->segment.seg_id, ret);
+ return ret;
+}
+
+static int cache_seg_ctrl_load(struct pcache_cache_segment *cache_seg)
+{
+ struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl;
+ struct pcache_cache_seg_gen cache_seg_gen, *cache_seg_gen_addr;
+ int ret = 0;
+
+ cache_seg_gen_addr = pcache_meta_find_latest(&cache_seg_ctrl->gen->header,
+ sizeof(struct pcache_cache_seg_gen),
+ sizeof(struct pcache_cache_seg_gen),
+ &cache_seg_gen);
+ if (IS_ERR(cache_seg_gen_addr)) {
+ ret = PTR_ERR(cache_seg_gen_addr);
+ goto out;
+ }
+
+ if (!cache_seg_gen_addr) {
+ cache_seg->gen = 0;
+ cache_seg->gen_seq = 0;
+ cache_seg->gen_index = 0;
+ goto out;
+ }
+
+ cache_seg->gen = cache_seg_gen.gen;
+ cache_seg->gen_seq = cache_seg_gen.header.seq;
+ cache_seg->gen_index = (cache_seg_gen_addr - cache_seg_ctrl->gen);
+out:
+
+ return ret;
+}
+
+static inline struct pcache_cache_seg_gen *get_cache_seg_gen_addr(struct pcache_cache_segment *cache_seg)
+{
+ struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl;
+
+ return (cache_seg_ctrl->gen + cache_seg->gen_index);
+}
+
+/*
+ * cache_seg_ctrl_write - write cache segment control information
+ * @cache_seg: the cache segment to update
+ *
+ * This function writes the control information of a cache segment to media.
+ *
+ * Although this updates shared control data, we intentionally do not use
+ * any locking here. All accesses to control information are single-threaded:
+ *
+ * - All reads occur during the init phase, where no concurrent writes
+ * can happen.
+ * - Writes happen once during init and once when the last reference
+ * to the segment is dropped in cache_seg_put().
+ *
+ * Both cases are guaranteed to be single-threaded, so there is no risk
+ * of concurrent read/write races.
+ */
+static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg)
+{
+ struct pcache_cache_seg_gen cache_seg_gen;
+
+ cache_seg_gen.gen = cache_seg->gen;
+ cache_seg_gen.header.seq = ++cache_seg->gen_seq;
+ cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header,
+ sizeof(struct pcache_cache_seg_gen));
+
+ memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen));
+ pmem_wmb();
+
+ cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX;
+}
+
+static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg)
+{
+ cache_seg->gen = 0;
+ cache_seg->gen_seq = 0;
+ cache_seg->gen_index = 0;
+ cache_seg_ctrl_write(cache_seg);
+}
+
+static int cache_seg_meta_load(struct pcache_cache_segment *cache_seg)
+{
+ int ret;
+
+ ret = cache_seg_info_load(cache_seg);
+ if (ret)
+ goto err;
+
+ ret = cache_seg_ctrl_load(cache_seg);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ return ret;
+}
+
+/**
+ * cache_seg_set_next_seg - Sets the ID of the next segment
+ * @cache_seg: Pointer to the cache segment structure.
+ * @seg_id: The segment ID to set as the next segment.
+ *
+ * A pcache_cache allocates multiple cache segments, which are linked together
+ * through next_seg. When loading a pcache_cache, the first cache segment can
+ * be found using cache->seg_id, which allows access to all the cache segments.
+ */
+void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id)
+{
+ cache_seg->cache_seg_info.flags |= PCACHE_SEG_INFO_FLAGS_HAS_NEXT;
+ cache_seg->cache_seg_info.next_seg = seg_id;
+ cache_seg_info_write(cache_seg);
+}
+
+int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id,
+ bool new_cache)
+{
+ struct pcache_cache_dev *cache_dev = cache->cache_dev;
+ struct pcache_cache_segment *cache_seg = &cache->segments[cache_seg_id];
+ struct pcache_segment_init_options seg_options = { 0 };
+ struct pcache_segment *segment = &cache_seg->segment;
+ int ret;
+
+ cache_seg->cache = cache;
+ cache_seg->cache_seg_id = cache_seg_id;
+ spin_lock_init(&cache_seg->gen_lock);
+ atomic_set(&cache_seg->refs, 0);
+ mutex_init(&cache_seg->info_lock);
+
+ /* init pcache_segment */
+ seg_options.type = PCACHE_SEGMENT_TYPE_CACHE_DATA;
+ seg_options.data_off = PCACHE_CACHE_SEG_CTRL_OFF + PCACHE_CACHE_SEG_CTRL_SIZE;
+ seg_options.seg_id = seg_id;
+ seg_options.seg_info = &cache_seg->cache_seg_info;
+ pcache_segment_init(cache_dev, segment, &seg_options);
+
+ cache_seg->cache_seg_ctrl = CACHE_DEV_SEGMENT(cache_dev, seg_id) + PCACHE_CACHE_SEG_CTRL_OFF;
+
+ if (new_cache) {
+ cache_dev_zero_range(cache_dev, CACHE_DEV_SEGMENT(cache_dev, seg_id),
+ PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX +
+ PCACHE_CACHE_SEG_CTRL_SIZE);
+
+ cache_seg_ctrl_init(cache_seg);
+
+ cache_seg->info_index = 0;
+ cache_seg_info_write(cache_seg);
+
+ /* clear outdated kset in segment */
+ memcpy_flushcache(segment->data, &pcache_empty_kset, sizeof(struct pcache_cache_kset_onmedia));
+ pmem_wmb();
+ } else {
+ ret = cache_seg_meta_load(cache_seg);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+err:
+ return ret;
+}
+
+/**
+ * get_cache_segment - Retrieves a free cache segment from the cache.
+ * @cache: Pointer to the cache structure.
+ *
+ * This function attempts to find a free cache segment that can be used.
+ * It locks the segment map and checks for the next available segment ID.
+ * If a free segment is found, it initializes it and returns a pointer to the
+ * cache segment structure. Returns NULL if no segments are available.
+ */
+struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache)
+{
+ struct pcache_cache_segment *cache_seg;
+ u32 seg_id;
+
+ spin_lock(&cache->seg_map_lock);
+again:
+ seg_id = find_next_zero_bit(cache->seg_map, cache->n_segs, cache->last_cache_seg);
+ if (seg_id == cache->n_segs) {
+ /* reset the hint of ->last_cache_seg and retry */
+ if (cache->last_cache_seg) {
+ cache->last_cache_seg = 0;
+ goto again;
+ }
+ cache->cache_full = true;
+ spin_unlock(&cache->seg_map_lock);
+ return NULL;
+ }
+
+ /*
+ * found an available cache_seg, mark it used in seg_map
+ * and update the search hint ->last_cache_seg
+ */
+ __set_bit(seg_id, cache->seg_map);
+ cache->last_cache_seg = seg_id;
+ spin_unlock(&cache->seg_map_lock);
+
+ cache_seg = &cache->segments[seg_id];
+ cache_seg->cache_seg_id = seg_id;
+
+ return cache_seg;
+}
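The allocation policy here is essentially "search forward from the last hint, wrap to zero once, otherwise the cache is full". A simplified user-space model of get_cache_segment()'s search (a plain bool array stands in for the kernel bitmap API; locking omitted):

#include <stdbool.h>
#include <stdio.h>

#define N_SEGS 8

/* Hint-based search with a single wrap, mirroring get_cache_segment(). */
static int alloc_seg(bool used[], unsigned int n, unsigned int *hint)
{
	unsigned int start = *hint;
	unsigned int i;

	for (i = start; i < n; i++) {
		if (!used[i])
			goto found;
	}
	/* nothing free at or after the hint: wrap to 0 and retry once */
	for (i = 0; i < start; i++) {
		if (!used[i])
			goto found;
	}
	return -1;	/* cache full */
found:
	used[i] = true;
	*hint = i;
	return i;
}

int main(void)
{
	bool used[N_SEGS] = { true, true, false, true, false, false, true, true };
	unsigned int hint = 5;

	printf("%d\n", alloc_seg(used, N_SEGS, &hint));	/* 5 */
	printf("%d\n", alloc_seg(used, N_SEGS, &hint));	/* wraps, returns 2 */
	return 0;
}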
+
+static void cache_seg_gen_increase(struct pcache_cache_segment *cache_seg)
+{
+ spin_lock(&cache_seg->gen_lock);
+ cache_seg->gen++;
+ spin_unlock(&cache_seg->gen_lock);
+
+ cache_seg_ctrl_write(cache_seg);
+}
+
+void cache_seg_get(struct pcache_cache_segment *cache_seg)
+{
+ atomic_inc(&cache_seg->refs);
+}
+
+static void cache_seg_invalidate(struct pcache_cache_segment *cache_seg)
+{
+ struct pcache_cache *cache;
+
+ cache = cache_seg->cache;
+ cache_seg_gen_increase(cache_seg);
+
+ spin_lock(&cache->seg_map_lock);
+ if (cache->cache_full)
+ cache->cache_full = false;
+ __clear_bit(cache_seg->cache_seg_id, cache->seg_map);
+ spin_unlock(&cache->seg_map_lock);
+
+ pcache_defer_reqs_kick(CACHE_TO_PCACHE(cache));
+ /* clean_work will clean the stale keys in the key_tree */
+ queue_work(cache_get_wq(cache), &cache->clean_work);
+}
+
+void cache_seg_put(struct pcache_cache_segment *cache_seg)
+{
+ if (atomic_dec_and_test(&cache_seg->refs))
+ cache_seg_invalidate(cache_seg);
+}
diff --git a/drivers/md/dm-pcache/cache_writeback.c b/drivers/md/dm-pcache/cache_writeback.c
new file mode 100644
index 000000000000..87a82b3fe836
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_writeback.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/bio.h>
+
+#include "cache.h"
+#include "backing_dev.h"
+#include "cache_dev.h"
+#include "dm_pcache.h"
+
+static void writeback_ctx_end(struct pcache_cache *cache, int ret)
+{
+ if (ret && !cache->writeback_ctx.ret) {
+ pcache_dev_err(CACHE_TO_PCACHE(cache), "writeback error: %d", ret);
+ cache->writeback_ctx.ret = ret;
+ }
+
+ if (!atomic_dec_and_test(&cache->writeback_ctx.pending))
+ return;
+
+ if (!cache->writeback_ctx.ret) {
+ backing_dev_flush(cache->backing_dev);
+
+ mutex_lock(&cache->dirty_tail_lock);
+ cache_pos_advance(&cache->dirty_tail, cache->writeback_ctx.advance);
+ cache_encode_dirty_tail(cache);
+ mutex_unlock(&cache->dirty_tail_lock);
+ }
+ queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0);
+}
+
+static void writeback_end_req(struct pcache_backing_dev_req *backing_req, int ret)
+{
+ struct pcache_cache *cache = backing_req->priv_data;
+
+ mutex_lock(&cache->writeback_lock);
+ writeback_ctx_end(cache, ret);
+ mutex_unlock(&cache->writeback_lock);
+}
+
+static inline bool is_cache_clean(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ u32 to_copy;
+ void *addr;
+ int ret;
+
+ addr = cache_pos_addr(dirty_tail);
+ kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf;
+
+ to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - dirty_tail->seg_off);
+ ret = copy_mc_to_kernel(kset_onmedia, addr, to_copy);
+ if (ret) {
+ pcache_dev_err(pcache, "error to read kset: %d", ret);
+ return true;
+ }
+
+ /* Check if the magic number matches the expected value */
+ if (kset_onmedia->magic != PCACHE_KSET_MAGIC) {
+ pcache_dev_debug(pcache, "dirty_tail: %u:%u magic: %llx, not expected: %llx\n",
+ dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off,
+ kset_onmedia->magic, PCACHE_KSET_MAGIC);
+ return true;
+ }
+
+ /* Verify the CRC checksum for data integrity */
+ if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
+ pcache_dev_debug(pcache, "dirty_tail: %u:%u crc: %x, not expected: %x\n",
+ dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off,
+ cache_kset_crc(kset_onmedia), kset_onmedia->crc);
+ return true;
+ }
+
+ return false;
+}
+
+void cache_writeback_exit(struct pcache_cache *cache)
+{
+ cancel_delayed_work_sync(&cache->writeback_work);
+ backing_dev_flush(cache->backing_dev);
+ cache_tree_exit(&cache->writeback_key_tree);
+}
+
+int cache_writeback_init(struct pcache_cache *cache)
+{
+ int ret;
+
+ ret = cache_tree_init(cache, &cache->writeback_key_tree, 1);
+ if (ret)
+ goto err;
+
+ atomic_set(&cache->writeback_ctx.pending, 0);
+
+ /* Queue delayed work to start writeback handling */
+ queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0);
+
+ return 0;
+err:
+ return ret;
+}
+
+static void cache_key_writeback(struct pcache_cache *cache, struct pcache_cache_key *key)
+{
+ struct pcache_backing_dev_req *writeback_req;
+ struct pcache_backing_dev_req_opts writeback_req_opts = { 0 };
+ struct pcache_cache_pos *pos;
+ void *addr;
+ u32 seg_remain, req_len, done = 0;
+
+ if (cache_key_clean(key))
+ return;
+
+ pos = &key->cache_pos;
+
+ seg_remain = cache_seg_remain(pos);
+ BUG_ON(seg_remain < key->len);
+next_req:
+ addr = cache_pos_addr(pos) + done;
+ req_len = backing_dev_req_coalesced_max_len(addr, key->len - done);
+
+ writeback_req_opts.type = BACKING_DEV_REQ_TYPE_KMEM;
+ writeback_req_opts.gfp_mask = GFP_NOIO;
+ writeback_req_opts.end_fn = writeback_end_req;
+ writeback_req_opts.priv_data = cache;
+
+ writeback_req_opts.kmem.data = addr;
+ writeback_req_opts.kmem.opf = REQ_OP_WRITE;
+ writeback_req_opts.kmem.len = req_len;
+ writeback_req_opts.kmem.backing_off = key->off + done;
+
+ writeback_req = backing_dev_req_create(cache->backing_dev, &writeback_req_opts);
+
+ atomic_inc(&cache->writeback_ctx.pending);
+ backing_dev_req_submit(writeback_req, true);
+
+ done += req_len;
+ if (done < key->len)
+ goto next_req;
+}
+
+static void cache_wb_tree_writeback(struct pcache_cache *cache, u32 advance)
+{
+ struct pcache_cache_tree *cache_tree = &cache->writeback_key_tree;
+ struct pcache_cache_subtree *cache_subtree;
+ struct rb_node *node;
+ struct pcache_cache_key *key;
+ u32 i;
+
+ cache->writeback_ctx.ret = 0;
+ cache->writeback_ctx.advance = advance;
+ atomic_set(&cache->writeback_ctx.pending, 1);
+
+ for (i = 0; i < cache_tree->n_subtrees; i++) {
+ cache_subtree = &cache_tree->subtrees[i];
+
+ node = rb_first(&cache_subtree->root);
+ while (node) {
+ key = CACHE_KEY(node);
+ node = rb_next(node);
+
+ cache_key_writeback(cache, key);
+ cache_key_delete(key);
+ }
+ }
+ writeback_ctx_end(cache, 0);
+}
+
+static int cache_kset_insert_tree(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
+{
+ struct pcache_cache_key_onmedia *key_onmedia;
+ struct pcache_cache_subtree *cache_subtree;
+ struct pcache_cache_key *key;
+ int ret;
+ u32 i;
+
+ /* Decode each key in the kset and insert it into the writeback tree */
+ for (i = 0; i < kset_onmedia->key_num; i++) {
+ key_onmedia = &kset_onmedia->data[i];
+
+ key = cache_key_alloc(&cache->writeback_key_tree, GFP_NOIO);
+ ret = cache_key_decode(cache, key_onmedia, key);
+ if (ret) {
+ cache_key_put(key);
+ goto clear_tree;
+ }
+
+ cache_subtree = get_subtree(&cache->writeback_key_tree, key->off);
+ spin_lock(&cache_subtree->tree_lock);
+ cache_key_insert(&cache->writeback_key_tree, key, true);
+ spin_unlock(&cache_subtree->tree_lock);
+ }
+
+ return 0;
+clear_tree:
+ cache_tree_clear(&cache->writeback_key_tree);
+ return ret;
+}
+
+static void last_kset_writeback(struct pcache_cache *cache,
+ struct pcache_cache_kset_onmedia *last_kset_onmedia)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_segment *next_seg;
+
+ pcache_dev_debug(pcache, "last kset, next: %u\n", last_kset_onmedia->next_cache_seg_id);
+
+ next_seg = &cache->segments[last_kset_onmedia->next_cache_seg_id];
+
+ mutex_lock(&cache->dirty_tail_lock);
+ cache->dirty_tail.cache_seg = next_seg;
+ cache->dirty_tail.seg_off = 0;
+ cache_encode_dirty_tail(cache);
+ mutex_unlock(&cache->dirty_tail_lock);
+}
+
+void cache_writeback_fn(struct work_struct *work)
+{
+ struct pcache_cache *cache = container_of(work, struct pcache_cache, writeback_work.work);
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_pos dirty_tail;
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ u32 delay;
+ int ret;
+
+ mutex_lock(&cache->writeback_lock);
+ if (atomic_read(&cache->writeback_ctx.pending))
+ goto unlock;
+
+ if (pcache_is_stopping(pcache))
+ goto unlock;
+
+ kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf;
+
+ mutex_lock(&cache->dirty_tail_lock);
+ cache_pos_copy(&dirty_tail, &cache->dirty_tail);
+ mutex_unlock(&cache->dirty_tail_lock);
+
+ if (is_cache_clean(cache, &dirty_tail)) {
+ delay = PCACHE_CACHE_WRITEBACK_INTERVAL;
+ goto queue_work;
+ }
+
+ if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
+ last_kset_writeback(cache, kset_onmedia);
+ delay = 0;
+ goto queue_work;
+ }
+
+ ret = cache_kset_insert_tree(cache, kset_onmedia);
+ if (ret) {
+ delay = PCACHE_CACHE_WRITEBACK_INTERVAL;
+ goto queue_work;
+ }
+
+ cache_wb_tree_writeback(cache, get_kset_onmedia_size(kset_onmedia));
+ delay = 0;
+queue_work:
+ queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, delay);
+unlock:
+ mutex_unlock(&cache->writeback_lock);
+}
diff --git a/drivers/md/dm-pcache/dm_pcache.c b/drivers/md/dm-pcache/dm_pcache.c
new file mode 100644
index 000000000000..e5f5936fa6f0
--- /dev/null
+++ b/drivers/md/dm-pcache/dm_pcache.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+
+#include "../dm-core.h"
+#include "cache_dev.h"
+#include "backing_dev.h"
+#include "cache.h"
+#include "dm_pcache.h"
+
+void pcache_defer_reqs_kick(struct dm_pcache *pcache)
+{
+ struct pcache_cache *cache = &pcache->cache;
+
+ spin_lock(&cache->seg_map_lock);
+ if (!cache->cache_full)
+ queue_work(pcache->task_wq, &pcache->defered_req_work);
+ spin_unlock(&cache->seg_map_lock);
+}
+
+static void defer_req(struct pcache_request *pcache_req)
+{
+ struct dm_pcache *pcache = pcache_req->pcache;
+
+ BUG_ON(!list_empty(&pcache_req->list_node));
+
+ spin_lock(&pcache->defered_req_list_lock);
+ list_add(&pcache_req->list_node, &pcache->defered_req_list);
+ pcache_defer_reqs_kick(pcache);
+ spin_unlock(&pcache->defered_req_list_lock);
+}
+
+static void defered_req_fn(struct work_struct *work)
+{
+ struct dm_pcache *pcache = container_of(work, struct dm_pcache, defered_req_work);
+ struct pcache_request *pcache_req;
+ LIST_HEAD(tmp_list);
+ int ret;
+
+ if (pcache_is_stopping(pcache))
+ return;
+
+ spin_lock(&pcache->defered_req_list_lock);
+ list_splice_init(&pcache->defered_req_list, &tmp_list);
+ spin_unlock(&pcache->defered_req_list_lock);
+
+ while (!list_empty(&tmp_list)) {
+ pcache_req = list_first_entry(&tmp_list,
+ struct pcache_request, list_node);
+ list_del_init(&pcache_req->list_node);
+ pcache_req->ret = 0;
+ ret = pcache_cache_handle_req(&pcache->cache, pcache_req);
+ if (ret == -EBUSY)
+ defer_req(pcache_req);
+ else
+ pcache_req_put(pcache_req, ret);
+ }
+}
+
+void pcache_req_get(struct pcache_request *pcache_req)
+{
+ kref_get(&pcache_req->ref);
+}
+
+static void end_req(struct kref *ref)
+{
+ struct pcache_request *pcache_req = container_of(ref, struct pcache_request, ref);
+ struct dm_pcache *pcache = pcache_req->pcache;
+ struct bio *bio = pcache_req->bio;
+ int ret = pcache_req->ret;
+
+ if (ret == -EBUSY) {
+ pcache_req_get(pcache_req);
+ defer_req(pcache_req);
+ } else {
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+
+ if (atomic_dec_and_test(&pcache->inflight_reqs))
+ wake_up(&pcache->inflight_wq);
+ }
+}
+
+void pcache_req_put(struct pcache_request *pcache_req, int ret)
+{
+ /* Set the return status if it is not already set */
+ if (ret && !pcache_req->ret)
+ pcache_req->ret = ret;
+
+ kref_put(&pcache_req->ref, end_req);
+}
+
+static bool at_least_one_arg(struct dm_arg_set *as, char **error)
+{
+ if (!as->argc) {
+ *error = "Insufficient args";
+ return false;
+ }
+
+ return true;
+}
+
+static int parse_cache_dev(struct dm_pcache *pcache, struct dm_arg_set *as,
+ char **error)
+{
+ int ret;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+ ret = dm_get_device(pcache->ti, dm_shift_arg(as),
+ BLK_OPEN_READ | BLK_OPEN_WRITE,
+ &pcache->cache_dev.dm_dev);
+ if (ret) {
+ *error = "Error opening cache device";
+ return ret;
+ }
+
+ return 0;
+}
+
+static int parse_backing_dev(struct dm_pcache *pcache, struct dm_arg_set *as,
+ char **error)
+{
+ int ret;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ ret = dm_get_device(pcache->ti, dm_shift_arg(as),
+ BLK_OPEN_READ | BLK_OPEN_WRITE,
+ &pcache->backing_dev.dm_dev);
+ if (ret) {
+ *error = "Error opening backing device";
+ return ret;
+ }
+
+ return 0;
+}
+
+static void pcache_init_opts(struct pcache_cache_options *opts)
+{
+ opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK;
+ opts->data_crc = false;
+}
+
+static int parse_cache_opts(struct dm_pcache *pcache, struct dm_arg_set *as,
+ char **error)
+{
+ struct pcache_cache_options *opts = &pcache->opts;
+ static const struct dm_arg _args[] = {
+ {0, 4, "Invalid number of cache option arguments"},
+ };
+ unsigned int argc;
+ const char *arg;
+ int ret;
+
+ pcache_init_opts(opts);
+ if (!as->argc)
+ return 0;
+
+ ret = dm_read_arg_group(_args, as, &argc, error);
+ if (ret)
+ return -EINVAL;
+
+ while (argc) {
+ arg = dm_shift_arg(as);
+ argc--;
+
+ if (!strcmp(arg, "cache_mode")) {
+ arg = dm_shift_arg(as);
+ if (!strcmp(arg, "writeback")) {
+ opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK;
+ } else {
+ *error = "Invalid cache mode parameter";
+ return -EINVAL;
+ }
+ argc--;
+ } else if (!strcmp(arg, "data_crc")) {
+ arg = dm_shift_arg(as);
+ if (!strcmp(arg, "true")) {
+ opts->data_crc = true;
+ } else if (!strcmp(arg, "false")) {
+ opts->data_crc = false;
+ } else {
+ *error = "Invalid data crc parameter";
+ return -EINVAL;
+ }
+ argc--;
+ } else {
+ *error = "Unrecognised cache option requested";
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
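Putting the three parsing steps together (cache device, then backing device, then the optional argument group counted by dm_read_arg_group()), the constructor expects `<cache_dev> <backing_dev> [<#opt_args> [cache_mode writeback] [data_crc true|false]]`. A hypothetical table entry might therefore read `0 209715200 pcache /dev/pmem0 /dev/sdb 4 cache_mode writeback data_crc false`; the device names and sector count here are placeholders for illustration, not values taken from this patch.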
+
+static int pcache_start(struct dm_pcache *pcache, char **error)
+{
+ int ret;
+
+ ret = cache_dev_start(pcache);
+ if (ret) {
+ *error = "Failed to start cache dev";
+ return ret;
+ }
+
+ ret = backing_dev_start(pcache);
+ if (ret) {
+ *error = "Failed to start backing dev";
+ goto stop_cache;
+ }
+
+ ret = pcache_cache_start(pcache);
+ if (ret) {
+ *error = "Failed to start pcache";
+ goto stop_backing;
+ }
+
+ return 0;
+stop_backing:
+ backing_dev_stop(pcache);
+stop_cache:
+ cache_dev_stop(pcache);
+
+ return ret;
+}
+
+static void pcache_destroy_args(struct dm_pcache *pcache)
+{
+ if (pcache->cache_dev.dm_dev)
+ dm_put_device(pcache->ti, pcache->cache_dev.dm_dev);
+ if (pcache->backing_dev.dm_dev)
+ dm_put_device(pcache->ti, pcache->backing_dev.dm_dev);
+}
+
+static int pcache_parse_args(struct dm_pcache *pcache, unsigned int argc, char **argv,
+ char **error)
+{
+ struct dm_arg_set as;
+ int ret;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ /*
+ * Parse cache device
+ */
+ ret = parse_cache_dev(pcache, &as, error);
+ if (ret)
+ return ret;
+ /*
+ * Parse backing device
+ */
+ ret = parse_backing_dev(pcache, &as, error);
+ if (ret)
+ goto out;
+ /*
+ * Parse optional arguments
+ */
+ ret = parse_cache_opts(pcache, &as, error);
+ if (ret)
+ goto out;
+
+ return 0;
+out:
+ pcache_destroy_args(pcache);
+ return ret;
+}
+
+static int dm_pcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct mapped_device *md = ti->table->md;
+ struct dm_pcache *pcache;
+ int ret;
+
+ if (md->map) {
+ ti->error = "Don't support table loading for live md";
+ return -EOPNOTSUPP;
+ }
+
+ /* Allocate memory for the cache structure */
+ pcache = kzalloc(sizeof(struct dm_pcache), GFP_KERNEL);
+ if (!pcache)
+ return -ENOMEM;
+
+ pcache->task_wq = alloc_workqueue("pcache-%s-wq", WQ_UNBOUND | WQ_MEM_RECLAIM,
+ 0, md->name);
+ if (!pcache->task_wq) {
+ ret = -ENOMEM;
+ goto free_pcache;
+ }
+
+ spin_lock_init(&pcache->defered_req_list_lock);
+ INIT_LIST_HEAD(&pcache->defered_req_list);
+ INIT_WORK(&pcache->defered_req_work, defered_req_fn);
+ pcache->ti = ti;
+
+ ret = pcache_parse_args(pcache, argc, argv, &ti->error);
+ if (ret)
+ goto destroy_wq;
+
+ ret = pcache_start(pcache, &ti->error);
+ if (ret)
+ goto destroy_args;
+
+ ti->num_flush_bios = 1;
+ ti->flush_supported = true;
+ ti->per_io_data_size = sizeof(struct pcache_request);
+ ti->private = pcache;
+ atomic_set(&pcache->inflight_reqs, 0);
+ atomic_set(&pcache->state, PCACHE_STATE_RUNNING);
+ init_waitqueue_head(&pcache->inflight_wq);
+
+ return 0;
+destroy_args:
+ pcache_destroy_args(pcache);
+destroy_wq:
+ destroy_workqueue(pcache->task_wq);
+free_pcache:
+ kfree(pcache);
+
+ return ret;
+}
+
+static void defer_req_stop(struct dm_pcache *pcache)
+{
+ struct pcache_request *pcache_req;
+ LIST_HEAD(tmp_list);
+
+ flush_work(&pcache->defered_req_work);
+
+ spin_lock(&pcache->defered_req_list_lock);
+ list_splice_init(&pcache->defered_req_list, &tmp_list);
+ spin_unlock(&pcache->defered_req_list_lock);
+
+ while (!list_empty(&tmp_list)) {
+ pcache_req = list_first_entry(&tmp_list,
+ struct pcache_request, list_node);
+ list_del_init(&pcache_req->list_node);
+ pcache_req_put(pcache_req, -EIO);
+ }
+}
+
+static void dm_pcache_dtr(struct dm_target *ti)
+{
+ struct dm_pcache *pcache;
+
+ pcache = ti->private;
+ atomic_set(&pcache->state, PCACHE_STATE_STOPPING);
+ defer_req_stop(pcache);
+
+ wait_event(pcache->inflight_wq,
+ atomic_read(&pcache->inflight_reqs) == 0);
+
+ pcache_cache_stop(pcache);
+ backing_dev_stop(pcache);
+ cache_dev_stop(pcache);
+
+ pcache_destroy_args(pcache);
+ drain_workqueue(pcache->task_wq);
+ destroy_workqueue(pcache->task_wq);
+
+ kfree(pcache);
+}
+
+static int dm_pcache_map_bio(struct dm_target *ti, struct bio *bio)
+{
+ struct pcache_request *pcache_req = dm_per_bio_data(bio, sizeof(struct pcache_request));
+ struct dm_pcache *pcache = ti->private;
+ int ret;
+
+ pcache_req->pcache = pcache;
+ kref_init(&pcache_req->ref);
+ pcache_req->ret = 0;
+ pcache_req->bio = bio;
+ pcache_req->off = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ pcache_req->data_len = bio->bi_iter.bi_size;
+ INIT_LIST_HEAD(&pcache_req->list_node);
+ atomic_inc(&pcache->inflight_reqs);
+
+ ret = pcache_cache_handle_req(&pcache->cache, pcache_req);
+ if (ret == -EBUSY)
+ defer_req(pcache_req);
+ else
+ pcache_req_put(pcache_req, ret);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+static void dm_pcache_status(struct dm_target *ti, status_type_t type,
+ unsigned int status_flags, char *result,
+ unsigned int maxlen)
+{
+ struct dm_pcache *pcache = ti->private;
+ struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
+ struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
+ struct pcache_cache *cache = &pcache->cache;
+ unsigned int sz = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%x %u %u %u %u %x %u:%u %u:%u %u:%u",
+ cache_dev->sb_flags,
+ cache_dev->seg_num,
+ cache->n_segs,
+ bitmap_weight(cache->seg_map, cache->n_segs),
+ pcache_cache_get_gc_percent(cache),
+ cache->cache_info.flags,
+ cache->key_head.cache_seg->cache_seg_id,
+ cache->key_head.seg_off,
+ cache->dirty_tail.cache_seg->cache_seg_id,
+ cache->dirty_tail.seg_off,
+ cache->key_tail.cache_seg->cache_seg_id,
+ cache->key_tail.seg_off);
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %s 4 cache_mode writeback crc %s",
+ cache_dev->dm_dev->name,
+ backing_dev->dm_dev->name,
+ cache_data_crc_on(cache) ? "true" : "false");
+ break;
+ case STATUSTYPE_IMA:
+ *result = '\0';
+ break;
+ }
+}
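For reference, STATUSTYPE_INFO emits twelve fields in the order of the format string above: superblock flags (hex), cache-device segment count, cache segment count, used segments, GC percent, cache-info flags (hex), then the key_head, dirty_tail and key_tail positions, each as <segment>:<offset>. A made-up example (values not taken from this patch) could therefore look like `0 64 64 12 70 0 3:1024 1:0 0:512`.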
+
+static int dm_pcache_message(struct dm_target *ti, unsigned int argc,
+ char **argv, char *result, unsigned int maxlen)
+{
+ struct dm_pcache *pcache = ti->private;
+ unsigned long val;
+
+ if (argc != 2)
+ goto err;
+
+ if (!strcasecmp(argv[0], "gc_percent")) {
+ if (kstrtoul(argv[1], 10, &val))
+ goto err;
+
+ return pcache_cache_set_gc_percent(&pcache->cache, val);
+ }
+err:
+ return -EINVAL;
+}
+
+static struct target_type dm_pcache_target = {
+ .name = "pcache",
+ .version = {0, 1, 0},
+ .module = THIS_MODULE,
+ .features = DM_TARGET_SINGLETON,
+ .ctr = dm_pcache_ctr,
+ .dtr = dm_pcache_dtr,
+ .map = dm_pcache_map_bio,
+ .status = dm_pcache_status,
+ .message = dm_pcache_message,
+};
+
+static int __init dm_pcache_init(void)
+{
+ int ret;
+
+ ret = pcache_backing_init();
+ if (ret)
+ goto err;
+
+ ret = pcache_cache_init();
+ if (ret)
+ goto backing_exit;
+
+ ret = dm_register_target(&dm_pcache_target);
+ if (ret)
+ goto cache_exit;
+ return 0;
+
+cache_exit:
+ pcache_cache_exit();
+backing_exit:
+ pcache_backing_exit();
+err:
+ return ret;
+}
+module_init(dm_pcache_init);
+
+static void __exit dm_pcache_exit(void)
+{
+ dm_unregister_target(&dm_pcache_target);
+ pcache_cache_exit();
+ pcache_backing_exit();
+}
+module_exit(dm_pcache_exit);
+
+MODULE_DESCRIPTION("dm-pcache Persistent Cache for block device");
+MODULE_AUTHOR("Dongsheng Yang <dongsheng.yang@linux.dev>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-pcache/dm_pcache.h b/drivers/md/dm-pcache/dm_pcache.h
new file mode 100644
index 000000000000..b4e06be0c0b9
--- /dev/null
+++ b/drivers/md/dm-pcache/dm_pcache.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _DM_PCACHE_H
+#define _DM_PCACHE_H
+#include <linux/device-mapper.h>
+
+#include "../dm-core.h"
+
+#define CACHE_DEV_TO_PCACHE(cache_dev) (container_of(cache_dev, struct dm_pcache, cache_dev))
+#define BACKING_DEV_TO_PCACHE(backing_dev) (container_of(backing_dev, struct dm_pcache, backing_dev))
+#define CACHE_TO_PCACHE(cache) (container_of(cache, struct dm_pcache, cache))
+
+#define PCACHE_STATE_RUNNING 1
+#define PCACHE_STATE_STOPPING 2
+
+struct pcache_cache_dev;
+struct pcache_backing_dev;
+struct pcache_cache;
+struct pcache_cache_options;
+struct dm_pcache {
+ struct dm_target *ti;
+ struct pcache_cache_dev cache_dev;
+ struct pcache_backing_dev backing_dev;
+ struct pcache_cache cache;
+ struct pcache_cache_options opts;
+
+ spinlock_t defered_req_list_lock;
+ struct list_head defered_req_list;
+ struct workqueue_struct *task_wq;
+
+ struct work_struct defered_req_work;
+
+ atomic_t state;
+ atomic_t inflight_reqs;
+ wait_queue_head_t inflight_wq;
+};
+
+static inline bool pcache_is_stopping(struct dm_pcache *pcache)
+{
+ return (atomic_read(&pcache->state) == PCACHE_STATE_STOPPING);
+}
+
+#define pcache_dev_err(pcache, fmt, ...) \
+ pcache_err("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__)
+#define pcache_dev_info(pcache, fmt, ...) \
+ pcache_info("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__)
+#define pcache_dev_debug(pcache, fmt, ...) \
+ pcache_debug("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__)
+
+struct pcache_request {
+ struct dm_pcache *pcache;
+ struct bio *bio;
+
+ u64 off;
+ u32 data_len;
+
+ struct kref ref;
+ int ret;
+
+ struct list_head list_node;
+};
+
+void pcache_req_get(struct pcache_request *pcache_req);
+void pcache_req_put(struct pcache_request *pcache_req, int ret);
+
+void pcache_defer_reqs_kick(struct dm_pcache *pcache);
+
+#endif /* _DM_PCACHE_H */
diff --git a/drivers/md/dm-pcache/pcache_internal.h b/drivers/md/dm-pcache/pcache_internal.h
new file mode 100644
index 000000000000..b7a3319d2bd3
--- /dev/null
+++ b/drivers/md/dm-pcache/pcache_internal.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _PCACHE_INTERNAL_H
+#define _PCACHE_INTERNAL_H
+
+#include <linux/delay.h>
+#include <linux/crc32c.h>
+
+#define pcache_err(fmt, ...) \
+ pr_err("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__)
+#define pcache_info(fmt, ...) \
+ pr_info("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__)
+#define pcache_debug(fmt, ...) \
+ pr_debug("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__)
+
+#define PCACHE_KB (1024ULL)
+#define PCACHE_MB (1024 * PCACHE_KB)
+
+/* Maximum number of metadata indices */
+#define PCACHE_META_INDEX_MAX 2
+
+#define PCACHE_CRC_SEED 0x3B15A
+/*
+ * struct pcache_meta_header - PCACHE metadata header structure
+ * @crc: CRC checksum for validating metadata integrity.
+ * @seq: Sequence number to track metadata updates.
+ * @version: Metadata version.
+ * @res: Reserved space for future use.
+ */
+struct pcache_meta_header {
+ __u32 crc;
+ __u8 seq;
+ __u8 version;
+ __u16 res;
+};
+
+/*
+ * pcache_meta_crc - Calculate CRC for the given metadata header.
+ * @header: Pointer to the metadata header.
+ * @meta_size: Size of the metadata structure.
+ *
+ * Returns the CRC checksum calculated by excluding the CRC field itself.
+ */
+static inline u32 pcache_meta_crc(struct pcache_meta_header *header, u32 meta_size)
+{
+ return crc32c(PCACHE_CRC_SEED, (void *)header + 4, meta_size - 4);
+}
+
+/*
+ * pcache_meta_seq_after - Check if a sequence number is more recent, accounting for overflow.
+ * @seq1: First sequence number.
+ * @seq2: Second sequence number.
+ *
+ * Determines if @seq1 is more recent than @seq2 by calculating the signed
+ * difference between them. This approach allows handling sequence number
+ * overflow correctly because the difference wraps naturally, and any value
+ * greater than zero indicates that @seq1 is "after" @seq2. This method
+ * assumes 8-bit unsigned sequence numbers, where the difference wraps
+ * around if seq1 overflows past seq2.
+ *
+ * Returns:
+ * - true if @seq1 is more recent than @seq2, indicating it comes "after"
+ * - false otherwise.
+ */
+static inline bool pcache_meta_seq_after(u8 seq1, u8 seq2)
+{
+ return (s8)(seq1 - seq2) > 0;
+}
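A quick worked example of the wrap-around comparison (illustrative only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t seq1 = 2, seq2 = 250;	/* seq1 has wrapped past 255 */

	/* (s8)(2 - 250) == 8 > 0, so seq1 is still considered "after" seq2 */
	printf("%d\n", (int8_t)(seq1 - seq2) > 0);
	return 0;
}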
+
+/*
+ * pcache_meta_find_latest - Find the latest valid metadata copy.
+ * @header: Pointer to the first on-media metadata copy.
+ * @meta_size: Size of the metadata structure to validate and copy.
+ * @meta_max_size: On-media stride between two consecutive metadata copies.
+ * @meta_ret: Buffer that the latest metadata is copied into.
+ *
+ * Finds the latest valid metadata by checking CRCs and sequence numbers of
+ * the PCACHE_META_INDEX_MAX on-media copies. If a valid copy with the
+ * highest sequence number is found, its content is copied into @meta_ret
+ * and its on-media address is returned. Returns NULL if no valid metadata
+ * is found, or an ERR_PTR() on hardware memory error.
+ */
+static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_header *header,
+ u32 meta_size, u32 meta_max_size,
+ void *meta_ret)
+{
+ struct pcache_meta_header *meta, *latest = NULL;
+ u32 i, seq_latest = 0;
+ void *meta_addr;
+
+ meta = meta_ret;
+
+ for (i = 0; i < PCACHE_META_INDEX_MAX; i++) {
+ meta_addr = (void *)header + (i * meta_max_size);
+ if (copy_mc_to_kernel(meta, meta_addr, meta_size)) {
+ pcache_err("hardware memory error when copy meta");
+ return ERR_PTR(-EIO);
+ }
+
+ /* Skip if CRC check fails, which means corrupted */
+ if (meta->crc != pcache_meta_crc(meta, meta_size))
+ continue;
+
+ /* Update latest if a more recent sequence is found */
+ if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) {
+ seq_latest = meta->seq;
+ latest = meta_addr;
+ }
+ }
+
+ if (!latest)
+ return NULL;
+
+ if (copy_mc_to_kernel(meta_ret, latest, meta_size)) {
+ pcache_err("hardware memory error");
+ return ERR_PTR(-EIO);
+ }
+
+ return latest;
+}
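pcache_meta_find_latest() is the read side of the multi-copy metadata scheme used throughout the driver (see cache_seg_info_write() and cache_seg_ctrl_write() in cache_segment.c): writers alternate between PCACHE_META_INDEX_MAX copies, each stamped with an incrementing sequence number and a CRC, and readers pick the valid copy with the newest sequence. A minimal user-space sketch of the idea follows; it is illustrative only, with a toy checksum standing in for pcache_meta_crc() and no attempt to model the on-media layout:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define SLOTS 2

struct meta {
	uint32_t crc;
	uint8_t  seq;
	uint32_t payload;
};

/* Toy checksum over everything after the crc field (stand-in for pcache_meta_crc()). */
static uint32_t meta_crc(const struct meta *m)
{
	return 0x5eed ^ m->seq ^ m->payload;
}

static void meta_write(struct meta slots[], unsigned int *index, uint8_t *seq, uint32_t payload)
{
	struct meta m = { .seq = ++(*seq), .payload = payload };

	m.crc = meta_crc(&m);
	slots[*index] = m;			/* overwrite the older copy */
	*index = (*index + 1) % SLOTS;
}

static const struct meta *meta_find_latest(const struct meta slots[])
{
	const struct meta *latest = NULL;
	unsigned int i;

	for (i = 0; i < SLOTS; i++) {
		if (slots[i].crc != meta_crc(&slots[i]))
			continue;		/* torn or never-written copy */
		if (!latest || (int8_t)(slots[i].seq - latest->seq) > 0)
			latest = &slots[i];
	}
	return latest;
}

int main(void)
{
	struct meta slots[SLOTS];
	unsigned int index = 0;
	uint8_t seq = 0;

	memset(slots, 0, sizeof(slots));
	meta_write(slots, &index, &seq, 100);
	meta_write(slots, &index, &seq, 200);	/* lands in the other slot */

	printf("%u\n", meta_find_latest(slots)->payload);	/* prints 200 */
	return 0;
}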
+
+#endif /* _PCACHE_INTERNAL_H */
diff --git a/drivers/md/dm-pcache/segment.c b/drivers/md/dm-pcache/segment.c
new file mode 100644
index 000000000000..7e9818701445
--- /dev/null
+++ b/drivers/md/dm-pcache/segment.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/dax.h>
+
+#include "pcache_internal.h"
+#include "cache_dev.h"
+#include "segment.h"
+
+int segment_copy_to_bio(struct pcache_segment *segment,
+ u32 data_off, u32 data_len, struct bio *bio, u32 bio_off)
+{
+ struct iov_iter iter;
+ size_t copied;
+ void *src;
+
+ iov_iter_bvec(&iter, ITER_DEST, &bio->bi_io_vec[bio->bi_iter.bi_idx],
+ bio_segments(bio), bio->bi_iter.bi_size);
+ iter.iov_offset = bio->bi_iter.bi_bvec_done;
+ if (bio_off)
+ iov_iter_advance(&iter, bio_off);
+
+ src = segment->data + data_off;
+ copied = _copy_mc_to_iter(src, data_len, &iter);
+ if (copied != data_len)
+ return -EIO;
+
+ return 0;
+}
+
+int segment_copy_from_bio(struct pcache_segment *segment,
+ u32 data_off, u32 data_len, struct bio *bio, u32 bio_off)
+{
+ struct iov_iter iter;
+ size_t copied;
+ void *dst;
+
+ iov_iter_bvec(&iter, ITER_SOURCE, &bio->bi_io_vec[bio->bi_iter.bi_idx],
+ bio_segments(bio), bio->bi_iter.bi_size);
+ iter.iov_offset = bio->bi_iter.bi_bvec_done;
+ if (bio_off)
+ iov_iter_advance(&iter, bio_off);
+
+ dst = segment->data + data_off;
+ copied = _copy_from_iter_flushcache(dst, data_len, &iter);
+ if (copied != data_len)
+ return -EIO;
+ pmem_wmb();
+
+ return 0;
+}
+
+void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment,
+ struct pcache_segment_init_options *options)
+{
+ segment->seg_info = options->seg_info;
+ segment_info_set_type(segment->seg_info, options->type);
+
+ segment->cache_dev = cache_dev;
+ segment->seg_id = options->seg_id;
+ segment->data_size = PCACHE_SEG_SIZE - options->data_off;
+ segment->data = CACHE_DEV_SEGMENT(cache_dev, options->seg_id) + options->data_off;
+}
diff --git a/drivers/md/dm-pcache/segment.h b/drivers/md/dm-pcache/segment.h
new file mode 100644
index 000000000000..deca1ddcb02b
--- /dev/null
+++ b/drivers/md/dm-pcache/segment.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _PCACHE_SEGMENT_H
+#define _PCACHE_SEGMENT_H
+
+#include <linux/bio.h>
+#include <linux/bitfield.h>
+
+#include "pcache_internal.h"
+
+struct pcache_segment_info {
+ struct pcache_meta_header header;
+ __u32 flags;
+ __u32 next_seg;
+};
+
+#define PCACHE_SEG_INFO_FLAGS_HAS_NEXT BIT(0)
+
+#define PCACHE_SEG_INFO_FLAGS_TYPE_MASK GENMASK(4, 1)
+#define PCACHE_SEGMENT_TYPE_CACHE_DATA 1
+
+static inline bool segment_info_has_next(struct pcache_segment_info *seg_info)
+{
+ return (seg_info->flags & PCACHE_SEG_INFO_FLAGS_HAS_NEXT);
+}
+
+static inline void segment_info_set_type(struct pcache_segment_info *seg_info, u8 type)
+{
+ seg_info->flags &= ~PCACHE_SEG_INFO_FLAGS_TYPE_MASK;
+ seg_info->flags |= FIELD_PREP(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, type);
+}
+
+static inline u8 segment_info_get_type(struct pcache_segment_info *seg_info)
+{
+ return FIELD_GET(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, seg_info->flags);
+}
+
+struct pcache_segment_pos {
+ struct pcache_segment *segment; /* Segment associated with the position */
+ u32 off; /* Offset within the segment */
+};
+
+struct pcache_segment_init_options {
+ u8 type;
+ u32 seg_id;
+ u32 data_off;
+
+ struct pcache_segment_info *seg_info;
+};
+
+struct pcache_segment {
+ struct pcache_cache_dev *cache_dev;
+
+ void *data;
+ u32 data_size;
+ u32 seg_id;
+
+ struct pcache_segment_info *seg_info;
+};
+
+int segment_copy_to_bio(struct pcache_segment *segment,
+ u32 data_off, u32 data_len, struct bio *bio, u32 bio_off);
+int segment_copy_from_bio(struct pcache_segment *segment,
+ u32 data_off, u32 data_len, struct bio *bio, u32 bio_off);
+
+static inline void segment_pos_advance(struct pcache_segment_pos *seg_pos, u32 len)
+{
+ BUG_ON(seg_pos->off + len > seg_pos->segment->data_size);
+
+ seg_pos->off += len;
+}
+
+void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment,
+ struct pcache_segment_init_options *options);
+#endif /* _PCACHE_SEGMENT_H */
diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c
index b49e10d76d03..f07e773d9cc0 100644
--- a/drivers/md/dm-ps-historical-service-time.c
+++ b/drivers/md/dm-ps-historical-service-time.c
@@ -541,8 +541,10 @@ static int __init dm_hst_init(void)
{
int r = dm_register_path_selector(&hst_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " HST_VERSION " loaded");
@@ -551,10 +553,7 @@ static int __init dm_hst_init(void)
static void __exit dm_hst_exit(void)
{
- int r = dm_unregister_path_selector(&hst_ps);
-
- if (r < 0)
- DMERR("unregister failed %d", r);
+ dm_unregister_path_selector(&hst_ps);
}
module_init(dm_hst_init);
diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c
index 461ee6b2044d..80415a045c68 100644
--- a/drivers/md/dm-ps-io-affinity.c
+++ b/drivers/md/dm-ps-io-affinity.c
@@ -116,7 +116,7 @@ static int ioa_create(struct path_selector *ps, unsigned int argc, char **argv)
if (!s)
return -ENOMEM;
- s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *),
+ s->path_map = kcalloc(nr_cpu_ids, sizeof(struct path_info *),
GFP_KERNEL);
if (!s->path_map)
goto free_selector;
@@ -260,10 +260,7 @@ static int __init dm_ioa_init(void)
static void __exit dm_ioa_exit(void)
{
- int ret = dm_unregister_path_selector(&ioa_ps);
-
- if (ret < 0)
- DMERR("unregister failed %d", ret);
+ dm_unregister_path_selector(&ioa_ps);
}
module_init(dm_ioa_init);
diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c
index e305f05ad1e5..9c68701ed7a4 100644
--- a/drivers/md/dm-ps-queue-length.c
+++ b/drivers/md/dm-ps-queue-length.c
@@ -260,8 +260,10 @@ static int __init dm_ql_init(void)
{
int r = dm_register_path_selector(&ql_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " QL_VERSION " loaded");
@@ -270,10 +272,7 @@ static int __init dm_ql_init(void)
static void __exit dm_ql_exit(void)
{
- int r = dm_unregister_path_selector(&ql_ps);
-
- if (r < 0)
- DMERR("unregister failed %d", r);
+ dm_unregister_path_selector(&ql_ps);
}
module_init(dm_ql_init);
diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c
index d1745b123dc1..0c12f4073461 100644
--- a/drivers/md/dm-ps-round-robin.c
+++ b/drivers/md/dm-ps-round-robin.c
@@ -220,8 +220,10 @@ static int __init dm_rr_init(void)
{
int r = dm_register_path_selector(&rr_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " RR_VERSION " loaded");
@@ -230,10 +232,7 @@ static int __init dm_rr_init(void)
static void __exit dm_rr_exit(void)
{
- int r = dm_unregister_path_selector(&rr_ps);
-
- if (r < 0)
- DMERR("unregister failed %d", r);
+ dm_unregister_path_selector(&rr_ps);
}
module_init(dm_rr_init);
diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c
index 969d31c40272..0543fe7969c4 100644
--- a/drivers/md/dm-ps-service-time.c
+++ b/drivers/md/dm-ps-service-time.c
@@ -341,8 +341,10 @@ static int __init dm_st_init(void)
{
int r = dm_register_path_selector(&st_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " ST_VERSION " loaded");
@@ -351,10 +353,7 @@ static int __init dm_st_init(void)
static void __exit dm_st_exit(void)
{
- int r = dm_unregister_path_selector(&st_ps);
-
- if (r < 0)
- DMERR("unregister failed %d", r);
+ dm_unregister_path_selector(&st_ps);
}
module_init(dm_st_init);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1e0d3b9b75d6..c6f7129e43d3 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -438,7 +438,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
/* Return true, if raid set in @rs is recovering */
static bool rs_is_recovering(struct raid_set *rs)
{
- return rs->md.recovery_cp < rs->md.dev_sectors;
+ return rs->md.resync_offset < rs->md.dev_sectors;
}
/* Return true, if raid set in @rs is reshaping */
@@ -768,7 +768,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
rs->md.layout = raid_type->algorithm;
rs->md.new_layout = rs->md.layout;
rs->md.delta_disks = 0;
- rs->md.recovery_cp = MaxSector;
+ rs->md.resync_offset = MaxSector;
for (i = 0; i < raid_devs; i++)
md_rdev_init(&rs->dev[i].rdev);
@@ -912,7 +912,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rs->md.external = 0;
rs->md.persistent = 1;
rs->md.major_version = 2;
- } else if (rebuild && !rs->md.recovery_cp) {
+ } else if (rebuild && !rs->md.resync_offset) {
/*
* Without metadata, we will not be able to tell if the array
* is in-sync or not - we must assume it is not. Therefore,
@@ -1355,11 +1355,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
return -EINVAL;
}
- /*
- * In device-mapper, we specify things in sectors, but
- * MD records this value in kB
- */
- if (value < 0 || value / 2 > COUNTER_MAX) {
+ if (value < 0) {
rs->ti->error = "Max write-behind limit out of range";
return -EINVAL;
}
@@ -1699,20 +1695,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
{
/* raid0 does not recover */
if (rs_is_raid0(rs))
- rs->md.recovery_cp = MaxSector;
+ rs->md.resync_offset = MaxSector;
/*
* A raid6 set has to be recovered either
* completely or for the grown part to
* ensure proper parity and Q-Syndrome
*/
else if (rs_is_raid6(rs))
- rs->md.recovery_cp = dev_sectors;
+ rs->md.resync_offset = dev_sectors;
/*
* Other raid set types may skip recovery
* depending on the 'nosync' flag.
*/
else
- rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
+ rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
? MaxSector : dev_sectors;
}
@@ -2147,7 +2143,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events);
sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
- sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+ sb->array_resync_offset = cpu_to_le64(mddev->resync_offset);
sb->level = cpu_to_le32(mddev->level);
sb->layout = cpu_to_le32(mddev->layout);
@@ -2338,18 +2334,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
}
if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
- mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+ mddev->resync_offset = le64_to_cpu(sb->array_resync_offset);
/*
* During load, we set FirstUse if a new superblock was written.
* There are two reasons we might not have a superblock:
* 1) The raid set is brand new - in which case, all of the
* devices must have their In_sync bit set. Also,
- * recovery_cp must be 0, unless forced.
+ * resync_offset must be 0, unless forced.
* 2) This is a new device being added to an old raid set
* and the new device needs to be rebuilt - in which
* case the In_sync bit will /not/ be set and
- * recovery_cp must be MaxSector.
+ * resync_offset must be MaxSector.
* 3) This is/are a new device(s) being added to an old
* raid set during takeover to a higher raid level
* to provide capacity for redundancy or during reshape
@@ -2394,8 +2390,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
new_devs > 1 ? "s" : "");
return -EINVAL;
} else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
- DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
- (unsigned long long) mddev->recovery_cp);
+ DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)",
+ (unsigned long long) mddev->resync_offset);
return -EINVAL;
} else if (rs_is_reshaping(rs)) {
DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
@@ -2410,7 +2406,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
sb_retrieve_failed_devices(sb, failed_devices);
rdev_for_each(r, mddev) {
- if (test_bit(Journal, &rdev->flags) ||
+ if (test_bit(Journal, &r->flags) ||
!r->sb_page)
continue;
sb2 = page_address(r->sb_page);
@@ -2535,6 +2531,10 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
+ /* Respect resynchronization requested with "sync" argument. */
+ if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
+ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+
freshest = NULL;
rdev_for_each(rdev, mddev) {
if (test_bit(Journal, &rdev->flags))
@@ -2700,11 +2700,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
}
out:
/*
- * Raise recovery_cp in case data_offset != 0 to
+ * Raise resync_offset in case data_offset != 0 to
* avoid false recovery positives in the constructor.
*/
- if (rs->md.recovery_cp < rs->md.dev_sectors)
- rs->md.recovery_cp += rs->dev[0].rdev.data_offset;
+ if (rs->md.resync_offset < rs->md.dev_sectors)
+ rs->md.resync_offset += rs->dev[0].rdev.data_offset;
/* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
rdev_for_each(rdev, &rs->md) {
@@ -2759,7 +2759,7 @@ static int rs_setup_takeover(struct raid_set *rs)
}
clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
while (d--) {
rdev = &rs->dev[d].rdev;
@@ -2767,7 +2767,7 @@ static int rs_setup_takeover(struct raid_set *rs)
if (test_bit(d, (void *) rs->rebuild_disks)) {
clear_bit(In_sync, &rdev->flags);
clear_bit(Faulty, &rdev->flags);
- mddev->recovery_cp = rdev->recovery_offset = 0;
+ mddev->resync_offset = rdev->recovery_offset = 0;
/* Bitmap has to be created when we do an "up" takeover */
set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
}
@@ -3196,7 +3196,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (reshape_sectors || rs_is_raid1(rs)) {
/*
* We can only prepare for a reshape here, because the
- * raid set needs to run to provide the repective reshape
+ * raid set needs to run to provide the respective reshape
* check functions via its MD personality instance.
*
* So do the reshape check after md_run() succeeded.
@@ -3225,7 +3225,7 @@ size_check:
if (r)
goto bad;
- rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors);
+ rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? rs->md.resync_offset : rs->md.dev_sectors);
} else {
/* This is no size change or it is shrinking, update size and record in superblocks */
r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
@@ -3247,7 +3247,7 @@ size_check:
rs_reset_inconclusive_reshape(rs);
/* Start raid set read-only and assumed clean to change in raid_resume() */
- rs->md.ro = 1;
+ rs->md.ro = MD_RDONLY;
rs->md.in_sync = 1;
/* Has to be held on running the array */
@@ -3308,6 +3308,7 @@ size_check:
/* Disable/enable discard support on raid set. */
configure_discard_support(rs);
+ rs->md.dm_gendisk = dm_disk(dm_table_get_md(ti->table));
mddev_unlock(&rs->md);
return 0;
@@ -3327,6 +3328,7 @@ static void raid_dtr(struct dm_target *ti)
mddev_lock_nointr(&rs->md);
md_stop(&rs->md);
+ rs->md.dm_gendisk = NULL;
mddev_unlock(&rs->md);
if (work_pending(&rs->md.event_work))
@@ -3383,7 +3385,7 @@ static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long r
/* The MD sync thread can be done with io or be interrupted but still be running */
if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
(test_bit(MD_RECOVERY_RUNNING, &recovery) ||
- (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
+ (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return st_reshape;
@@ -3447,7 +3449,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
} else {
if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
- r = mddev->recovery_cp;
+ r = mddev->resync_offset;
else
r = mddev->curr_resync_completed;
@@ -3773,11 +3775,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
} else
return -EINVAL;
}
- if (mddev->ro == 2) {
+ if (mddev->ro == MD_AUTO_READ) {
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
if (!mddev->suspended)
md_wakeup_thread(mddev->sync_thread);
}
@@ -3811,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
struct raid_set *rs = ti->private;
unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
- limits->io_min = chunk_size_bytes;
- limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs);
+ if (chunk_size_bytes) {
+ limits->io_min = chunk_size_bytes;
+ limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs);
+ }
}
static void raid_presuspend(struct dm_target *ti)
@@ -3856,6 +3860,7 @@ static void raid_postsuspend(struct dm_target *ti)
*/
md_stop_writes(&rs->md);
mddev_suspend(&rs->md, false);
+ rs->md.ro = MD_RDONLY;
}
}
@@ -3951,9 +3956,11 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
!test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
struct mddev *mddev = &rs->md;
- r = mddev->bitmap_ops->load(mddev);
- if (r)
- DMERR("Failed to load bitmap");
+ if (md_bitmap_enabled(mddev, false)) {
+ r = mddev->bitmap_ops->load(mddev);
+ if (r)
+ DMERR("Failed to load bitmap");
+ }
}
return r;
@@ -3966,7 +3973,7 @@ static void rs_update_sbs(struct raid_set *rs)
int ro = mddev->ro;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
md_update_sb(mddev, 1);
mddev->ro = ro;
}
@@ -4068,16 +4075,18 @@ static int raid_preresume(struct dm_target *ti)
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
- r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
- chunksize, false);
- if (r)
- DMERR("Failed to resize bitmap");
+ if (md_bitmap_enabled(mddev, false)) {
+ r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
+ chunksize);
+ if (r)
+ DMERR("Failed to resize bitmap");
+ }
}
/* Check for any resize/reshape on @rs and adjust/initiate */
- if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
+ if (mddev->resync_offset && mddev->resync_offset < MaxSector) {
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- mddev->resync_min = mddev->recovery_cp;
+ mddev->resync_min = mddev->resync_offset;
if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
mddev->resync_max_sectors = mddev->dev_sectors;
}
@@ -4123,7 +4132,7 @@ static void raid_resume(struct dm_target *ti)
WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread,
lockdep_is_held(&mddev->reconfig_mutex)));
clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
mddev->in_sync = 0;
md_unfrozen_sync_thread(mddev);
mddev_unlock_and_resume(mddev);
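The dm-raid.c hunks above replace the magic 0/1/2 values previously stored in mddev->ro with the named md_ro_state constants and the md_is_rdwr() helper. For readers without the md headers at hand, the sketch below shows the definitions these symbols are assumed to have in drivers/md/md.h (reproduced from memory, so treat it as an illustration rather than an authoritative quotation):

/* Assumed definitions backing MD_RDWR/MD_RDONLY/MD_AUTO_READ above. */
enum md_ro_state {
	MD_RDWR,	/* 0: array accepts writes */
	MD_RDONLY,	/* 1: array is read-only */
	MD_AUTO_READ,	/* 2: read-only until the first write request arrives */
	MD_MAX_STATE
};

static inline bool md_is_rdwr(struct mddev *mddev)
{
	return mddev->ro == MD_RDWR;
}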
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9511dae5b556..268f734ca9c3 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -103,7 +103,7 @@ static void wakeup_mirrord(void *context)
static void delayed_wake_fn(struct timer_list *t)
{
- struct mirror_set *ms = from_timer(ms, t, timer);
+ struct mirror_set *ms = timer_container_of(ms, t, timer);
clear_bit(0, &ms->timer_pending);
wakeup_mirrord(ms);
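from_timer() has been renamed to timer_container_of(); both recover the structure that embeds the struct timer_list handed to a timer callback, in the same way container_of() does. A minimal sketch of the idiom, using the hypothetical names my_state and expire_fn:

#include <linux/timer.h>

struct my_state {
	struct timer_list timer;
	bool pending;
};

static void expire_fn(struct timer_list *t)
{
	/* Recover the enclosing my_state from its embedded timer. */
	struct my_state *s = timer_container_of(s, t, timer);

	s->pending = false;
}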
@@ -133,10 +133,9 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
spin_lock_irqsave(&ms->lock, flags);
should_wake = !(bl->head);
bio_list_add(bl, bio);
- spin_unlock_irqrestore(&ms->lock, flags);
-
if (should_wake)
wakeup_mirrord(ms);
+ spin_unlock_irqrestore(&ms->lock, flags);
}
static void dispatch_bios(void *context, struct bio_list *bio_list)
@@ -646,9 +645,9 @@ static void write_callback(unsigned long error, void *context)
if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio);
- spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
+ spin_unlock_irqrestore(&ms->lock, flags);
}
static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -656,7 +655,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
unsigned int i;
struct dm_io_region io[MAX_NR_MIRRORS], *dest = io;
struct mirror *m;
- blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH);
+ blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH | REQ_ATOMIC);
struct dm_io_request io_req = {
.bi_opf = REQ_OP_WRITE | op_flags,
.mem.type = DM_IO_BIO,
@@ -1182,7 +1181,7 @@ static void mirror_dtr(struct dm_target *ti)
{
struct mirror_set *ms = ti->private;
- del_timer_sync(&ms->timer);
+ timer_delete_sync(&ms->timer);
flush_workqueue(ms->kmirrord_wq);
flush_work(&ms->trigger_event);
dm_kcopyd_client_destroy(ms->kcopyd_client);
@@ -1483,8 +1482,9 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 14, 0},
+ .version = {1, 15, 0},
.module = THIS_MODULE,
+ .features = DM_TARGET_ATOMIC_WRITES,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
.map = mirror_map,
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index a4550975c27d..e9b47b659976 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -206,7 +206,7 @@ struct dm_region_hash *dm_region_hash_create(
rh->shift = RH_HASH_SHIFT;
rh->prime = RH_HASH_MULT;
- rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets)));
+ rh->buckets = vmalloc_array(nr_buckets, sizeof(*rh->buckets));
if (!rh->buckets) {
DMERR("unable to allocate region hash bucket memory");
kfree(rh);
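vmalloc_array() is the vmalloc counterpart of kmalloc_array(): it performs the element-count multiplication with overflow checking internally, so callers no longer need to wrap the size in array_size(). A minimal sketch, with nr_buckets and struct bucket as placeholder names:

#include <linux/vmalloc.h>

struct bucket {
	unsigned long key;
};

static struct bucket *alloc_buckets(unsigned int nr_buckets)
{
	/* Returns NULL on allocation failure or multiplication overflow. */
	struct bucket *b = vmalloc_array(nr_buckets, sizeof(*b));

	return b;	/* release with vfree() */
}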
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 499f8cc8a39f..a6ca92049c10 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -217,10 +217,10 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped)
if (unlikely(error == BLK_STS_TARGET)) {
if (req_op(clone) == REQ_OP_DISCARD &&
!clone->q->limits.max_discard_sectors)
- disable_discard(tio->md);
+ blk_queue_disable_discard(tio->md->queue);
else if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
!clone->q->limits.max_write_zeroes_sectors)
- disable_write_zeroes(tio->md);
+ blk_queue_disable_write_zeroes(tio->md->queue);
}
switch (r) {
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
- md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
+ md->tag_set->flags = BLK_MQ_F_STACKING;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 4112071de0be..1461dc740dae 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -316,7 +316,7 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode, void **kaddr,
- pfn_t *pfn)
+ unsigned long *pfn)
{
struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);
@@ -405,7 +405,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
blk_status_t *error)
{
unsigned int i;
- char major_minor[16];
+ char major_minor[22];
struct stripe_c *sc = ti->private;
if (!*error)
@@ -417,8 +417,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
if (*error == BLK_STS_NOTSUPP)
return DM_ENDIO_DONE;
- memset(major_minor, 0, sizeof(major_minor));
- sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)));
+ format_dev_t(major_minor, bio_dev(bio));
/*
* Test to see which stripe drive triggered the event
@@ -457,16 +456,22 @@ static void stripe_io_hints(struct dm_target *ti,
struct queue_limits *limits)
{
struct stripe_c *sc = ti->private;
- unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT;
+ unsigned int io_min, io_opt;
- limits->io_min = chunk_size;
- limits->io_opt = chunk_size * sc->stripes;
+ limits->chunk_sectors = sc->chunk_size;
+
+ if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) &&
+ !check_mul_overflow(io_min, sc->stripes, &io_opt)) {
+ limits->io_min = io_min;
+ limits->io_opt = io_opt;
+ }
}
static struct target_type stripe_target = {
.name = "striped",
- .version = {1, 6, 0},
- .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT,
+ .version = {1, 7, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
+ DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
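The stripe_io_hints() rewrite above replaces an unchecked shift and multiply with the helpers from <linux/overflow.h>. check_shl_overflow() and check_mul_overflow() return true when the result does not fit and store through the destination pointer only when it does, so io_min/io_opt are simply left at their defaults for pathologically large chunk sizes. A sketch of the same guard with hypothetical inputs chunk_sectors and nr_stripes:

#include <linux/blkdev.h>
#include <linux/overflow.h>

static void set_striped_hints(struct queue_limits *limits,
			      unsigned int chunk_sectors,
			      unsigned int nr_stripes)
{
	unsigned int io_min, io_opt;

	/* Publish the hints only when both computations fit in an unsigned int. */
	if (!check_shl_overflow(chunk_sectors, SECTOR_SHIFT, &io_min) &&
	    !check_mul_overflow(io_min, nr_stripes, &io_opt)) {
		limits->io_min = io_min;
		limits->io_opt = io_opt;
	}
}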
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index dfd9fb52a6f3..50a52ca50b34 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -114,8 +114,8 @@ static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths)
return -EINVAL;
}
- sctx->region_table = vmalloc(array_size(nr_slots,
- sizeof(region_table_slot_t)));
+ sctx->region_table = vmalloc_array(nr_slots,
+ sizeof(region_table_slot_t));
if (!sctx->region_table) {
ti->error = "Cannot allocate region table";
return -ENOMEM;
@@ -517,7 +517,9 @@ static void switch_status(struct dm_target *ti, status_type_t type,
*
* Passthrough all ioctls to the path for sector 0
*/
-static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct switch_ctx *sctx = ti->private;
unsigned int path_nr;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bd8b796ae683..ad0a60a07b93 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -117,7 +117,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
n_targets = (struct dm_target *) (n_highs + num);
memset(n_highs, -1, sizeof(*n_highs) * num);
- kvfree(t->highs);
t->num_allocated = num;
t->highs = n_highs;
@@ -257,7 +256,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
if (bdev_is_zoned(bdev)) {
unsigned int zone_sectors = bdev_zone_sectors(bdev);
- if (start & (zone_sectors - 1)) {
+ if (!bdev_is_zone_aligned(bdev, start)) {
DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg",
dm_device_name(ti->table->md),
(unsigned long long)start,
@@ -274,7 +273,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
* devices do not end up with a smaller zone in the middle of
* the sector range.
*/
- if (len & (zone_sectors - 1)) {
+ if (!bdev_is_zone_aligned(bdev, len)) {
DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg",
dm_device_name(ti->table->md),
(unsigned long long)len,
@@ -431,6 +430,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
+ mutex_lock(&q->limits_lock);
+ /*
+ * BLK_FEAT_ATOMIC_WRITES is not inherited from the bottom device in
+ * blk_stack_limits(), so do it manually.
+ */
+ limits->features |= (q->limits.features & BLK_FEAT_ATOMIC_WRITES);
+
if (blk_stack_limits(limits, &q->limits,
get_start_sect(bdev) + start) < 0)
DMWARN("%s: adding target device %pg caused an alignment inconsistency: "
@@ -448,6 +454,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
*/
if (!dm_target_has_integrity(ti->type))
queue_limits_stack_integrity_bdev(limits, bdev);
+ mutex_unlock(&q->limits_lock);
return 0;
}
@@ -523,8 +530,9 @@ static char **realloc_argv(unsigned int *size, char **old_argv)
gfp = GFP_NOIO;
}
argv = kmalloc_array(new_size, sizeof(*argv), gfp);
- if (argv && old_argv) {
- memcpy(argv, old_argv, *size * sizeof(*argv));
+ if (argv) {
+ if (old_argv)
+ memcpy(argv, old_argv, *size * sizeof(*argv));
*size = new_size;
}
@@ -697,6 +705,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
DMERR("%s: zero-length target", dm_device_name(t->md));
return -EINVAL;
}
+ if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) {
+ DMERR("%s: too large device", dm_device_name(t->md));
+ return -EINVAL;
+ }
ti->type = dm_get_target_type(type);
if (!ti->type) {
@@ -887,17 +899,17 @@ static bool dm_table_supports_dax(struct dm_table *t,
return true;
}
-static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_is_not_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
struct block_device *bdev = dev->bdev;
struct request_queue *q = bdev_get_queue(bdev);
/* request-based cannot stack on partitions! */
if (bdev_is_partition(bdev))
- return false;
+ return true;
- return queue_is_mq(q);
+ return !queue_is_mq(q);
}
static int dm_table_determine_type(struct dm_table *t)
@@ -993,7 +1005,7 @@ verify_rq_based:
/* Non-request-stackable devices can't be used for request-based dm */
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) {
+ ti->type->iterate_devices(ti, device_is_not_rq_stackable, NULL)) {
DMERR("table load rejected: including non-request-stackable devices");
return -EINVAL;
}
@@ -1045,7 +1057,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
unsigned int min_pool_size = 0, pool_size;
struct dm_md_mempools *pools;
unsigned int bioset_flags = 0;
- bool mempool_needs_integrity = t->integrity_supported;
if (unlikely(type == DM_TYPE_NONE)) {
DMERR("no table type is set, can't allocate mempools");
@@ -1070,8 +1081,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
per_io_data_size = max(per_io_data_size, ti->per_io_data_size);
min_pool_size = max(min_pool_size, ti->num_flush_bios);
-
- mempool_needs_integrity |= ti->mempool_needs_integrity;
}
pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size,
@@ -1081,15 +1090,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
__alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags))
goto out_free_pools;
- if (mempool_needs_integrity &&
- bioset_integrity_create(&pools->io_bs, pool_size))
- goto out_free_pools;
init_bs:
if (bioset_init(&pools->bs, pool_size, front_pad, 0))
goto out_free_pools;
- if (mempool_needs_integrity &&
- bioset_integrity_create(&pools->bs, pool_size))
- goto out_free_pools;
t->mempools = pools;
return 0;
@@ -1177,7 +1180,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
t = dm_get_live_table(md, &srcu_idx);
if (!t)
- return 0;
+ goto put_live_table;
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
@@ -1188,10 +1191,181 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
(void *)key);
}
+put_live_table:
dm_put_live_table(md, srcu_idx);
return 0;
}
+enum dm_wrappedkey_op {
+ DERIVE_SW_SECRET,
+ IMPORT_KEY,
+ GENERATE_KEY,
+ PREPARE_KEY,
+};
+
+struct dm_wrappedkey_op_args {
+ enum dm_wrappedkey_op op;
+ int err;
+ union {
+ struct {
+ const u8 *eph_key;
+ size_t eph_key_size;
+ u8 *sw_secret;
+ } derive_sw_secret;
+ struct {
+ const u8 *raw_key;
+ size_t raw_key_size;
+ u8 *lt_key;
+ } import_key;
+ struct {
+ u8 *lt_key;
+ } generate_key;
+ struct {
+ const u8 *lt_key;
+ size_t lt_key_size;
+ u8 *eph_key;
+ } prepare_key;
+ };
+};
+
+static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_wrappedkey_op_args *args = data;
+ struct block_device *bdev = dev->bdev;
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+ int err = -EOPNOTSUPP;
+
+ if (!args->err)
+ return 0;
+
+ switch (args->op) {
+ case DERIVE_SW_SECRET:
+ err = blk_crypto_derive_sw_secret(
+ bdev,
+ args->derive_sw_secret.eph_key,
+ args->derive_sw_secret.eph_key_size,
+ args->derive_sw_secret.sw_secret);
+ break;
+ case IMPORT_KEY:
+ err = blk_crypto_import_key(profile,
+ args->import_key.raw_key,
+ args->import_key.raw_key_size,
+ args->import_key.lt_key);
+ break;
+ case GENERATE_KEY:
+ err = blk_crypto_generate_key(profile,
+ args->generate_key.lt_key);
+ break;
+ case PREPARE_KEY:
+ err = blk_crypto_prepare_key(profile,
+ args->prepare_key.lt_key,
+ args->prepare_key.lt_key_size,
+ args->prepare_key.eph_key);
+ break;
+ }
+ args->err = err;
+
+ /* Try another device in case this fails. */
+ return 0;
+}
+
+static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile,
+ struct dm_wrappedkey_op_args *args)
+{
+ struct mapped_device *md =
+ container_of(profile, struct dm_crypto_profile, profile)->md;
+ struct dm_target *ti;
+ struct dm_table *t;
+ int srcu_idx;
+ int i;
+
+ args->err = -EOPNOTSUPP;
+
+ t = dm_get_live_table(md, &srcu_idx);
+ if (!t)
+ goto out;
+
+ /*
+ * blk-crypto currently has no support for multiple incompatible
+ * implementations of wrapped inline crypto keys on a single system.
+ * It was already checked earlier that support for wrapped keys was
+ * declared on all underlying devices. Thus, all the underlying devices
+ * should support all wrapped key operations and they should behave
+ * identically, i.e. work with the same keys. So, just executing the
+ * operation on the first device on which it works suffices for now.
+ */
+ for (i = 0; i < t->num_targets; i++) {
+ ti = dm_table_get_target(t, i);
+ if (!ti->type->iterate_devices)
+ continue;
+ ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args);
+ if (!args->err)
+ break;
+ }
+out:
+ dm_put_live_table(md, srcu_idx);
+ return args->err;
+}
+
+static int dm_derive_sw_secret(struct blk_crypto_profile *profile,
+ const u8 *eph_key, size_t eph_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = DERIVE_SW_SECRET,
+ .derive_sw_secret = {
+ .eph_key = eph_key,
+ .eph_key_size = eph_key_size,
+ .sw_secret = sw_secret,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
+static int dm_import_key(struct blk_crypto_profile *profile,
+ const u8 *raw_key, size_t raw_key_size,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = IMPORT_KEY,
+ .import_key = {
+ .raw_key = raw_key,
+ .raw_key_size = raw_key_size,
+ .lt_key = lt_key,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
+static int dm_generate_key(struct blk_crypto_profile *profile,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = GENERATE_KEY,
+ .generate_key = {
+ .lt_key = lt_key,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
+static int dm_prepare_key(struct blk_crypto_profile *profile,
+ const u8 *lt_key, size_t lt_key_size,
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = PREPARE_KEY,
+ .prepare_key = {
+ .lt_key = lt_key,
+ .lt_key_size = lt_key_size,
+ .eph_key = eph_key,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
static int
device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
@@ -1250,6 +1424,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t)
profile->max_dun_bytes_supported = UINT_MAX;
memset(profile->modes_supported, 0xFF,
sizeof(profile->modes_supported));
+ profile->key_types_supported = ~0;
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
@@ -1265,6 +1440,13 @@ static int dm_table_construct_crypto_profile(struct dm_table *t)
profile);
}
+ if (profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) {
+ profile->ll_ops.derive_sw_secret = dm_derive_sw_secret;
+ profile->ll_ops.import_key = dm_import_key;
+ profile->ll_ops.generate_key = dm_generate_key;
+ profile->ll_ops.prepare_key = dm_prepare_key;
+ }
+
if (t->md->queue &&
!blk_crypto_has_capabilities(profile,
t->md->queue->crypto_profile)) {
@@ -1492,6 +1674,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t)
return true;
}
+bool dm_table_is_wildcard(struct dm_table *t)
+{
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (!dm_target_is_wildcard(ti->type))
+ return false;
+ }
+
+ return true;
+}
+
static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1723,8 +1917,12 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
+ int b;
- return !q->limits.max_write_zeroes_sectors;
+ mutex_lock(&q->limits_lock);
+ b = !q->limits.max_write_zeroes_sectors;
+ mutex_unlock(&q->limits_lock);
+ return b;
}
static bool dm_table_supports_write_zeroes(struct dm_table *t)
@@ -1806,10 +2004,50 @@ static bool dm_table_supports_secure_erase(struct dm_table *t)
return true;
}
+static int device_not_atomic_write_capable(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ return !bdev_can_atomic_write(dev->bdev);
+}
+
+static bool dm_table_supports_atomic_writes(struct dm_table *t)
+{
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (!dm_target_supports_atomic_writes(ti->type))
+ return false;
+
+ if (!ti->type->iterate_devices)
+ return false;
+
+ if (ti->type->iterate_devices(ti,
+ device_not_atomic_write_capable, NULL)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size,
+ sector_t new_size)
+{
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && dm_has_zone_plugs(t->md) &&
+ old_size != new_size) {
+ DMWARN("%s: device has zone write plug resources. "
+ "Cannot change size",
+ dm_device_name(t->md));
+ return false;
+ }
+ return true;
+}
+
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
int r;
+ struct queue_limits old_limits;
if (!dm_table_supports_nowait(t))
limits->features &= ~BLK_FEAT_NOWAIT;
@@ -1827,8 +2065,10 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
limits->discard_alignment = 0;
}
- if (!dm_table_supports_write_zeroes(t))
+ if (!dm_table_supports_write_zeroes(t)) {
limits->max_write_zeroes_sectors = 0;
+ limits->max_hw_wzeroes_unmap_sectors = 0;
+ }
if (!dm_table_supports_secure_erase(t))
limits->max_secure_erase_sectors = 0;
@@ -1836,25 +2076,30 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (dm_table_supports_flush(t))
limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
- if (dm_table_supports_dax(t, device_not_dax_capable)) {
+ if (dm_table_supports_dax(t, device_not_dax_capable))
limits->features |= BLK_FEAT_DAX;
- if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
- set_dax_synchronous(t->md->dax_dev);
- } else
+ else
limits->features &= ~BLK_FEAT_DAX;
- if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
- dax_write_cache(t->md->dax_dev, true);
-
/* For a zoned table, setup the zone related queue attributes. */
- if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- (limits->features & BLK_FEAT_ZONED)) {
- r = dm_set_zones_restrictions(t, q, limits);
- if (r)
- return r;
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+ if (limits->features & BLK_FEAT_ZONED) {
+ r = dm_set_zones_restrictions(t, q, limits);
+ if (r)
+ return r;
+ } else if (dm_has_zone_plugs(t->md)) {
+ DMWARN("%s: device has zone write plug resources. "
+ "Cannot switch to non-zoned table.",
+ dm_device_name(t->md));
+ return -EINVAL;
+ }
}
- r = queue_limits_set(q, limits);
+ if (dm_table_supports_atomic_writes(t))
+ limits->features |= BLK_FEAT_ATOMIC_WRITES;
+
+ old_limits = queue_limits_start_update(q);
+ r = queue_limits_commit_update(q, limits);
if (r)
return r;
@@ -1865,10 +2110,21 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
(limits->features & BLK_FEAT_ZONED)) {
r = dm_revalidate_zones(t, q);
- if (r)
+ if (r) {
+ queue_limits_set(q, &old_limits);
return r;
+ }
}
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+ dm_finalize_zone_settings(t, limits);
+
+ if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
+ set_dax_synchronous(t->md->dax_dev);
+
+ if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
+ dax_write_cache(t->md->dax_dev, true);
+
dm_update_crypto_profile(q, t);
return 0;
}
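dm_table_set_restrictions() now goes through the queue_limits_start_update()/queue_limits_commit_update() pair instead of queue_limits_set(), keeping the snapshot returned by the former so the previous limits can be restored if zone revalidation fails afterwards. For a driver adjusting a single field, the API is typically used as in this sketch (shrink_max_sectors is a made-up helper):

#include <linux/blkdev.h>

static int shrink_max_sectors(struct request_queue *q, unsigned int max)
{
	struct queue_limits lim;

	/* Takes q->limits_lock and returns a private copy to edit. */
	lim = queue_limits_start_update(q);
	lim.max_hw_sectors = max;
	/* Validates the copy, applies it atomically and drops the lock. */
	return queue_limits_commit_update(q, &lim);
}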
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 652627aea11b..8fede41adec0 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -255,7 +255,7 @@ static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits)
static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode, void **kaddr,
- pfn_t *pfn)
+ unsigned long *pfn)
{
return -EIO;
}
@@ -263,7 +263,8 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
static struct target_type error_target = {
.name = "error",
.version = {1, 7, 0},
- .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM,
+ .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM |
+ DM_TARGET_PASSES_INTEGRITY,
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index bf0f9dddd146..c84149ba4e38 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2332,10 +2332,9 @@ static struct thin_c *get_first_thin(struct pool *pool)
struct thin_c *tc = NULL;
rcu_read_lock();
- if (!list_empty(&pool->active_thins)) {
- tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+ tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list);
+ if (tc)
thin_get(tc);
- }
rcu_read_unlock();
return tc;
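list_first_or_null_rcu() folds the emptiness check and the RCU-protected first-entry dereference into a single primitive, so the head pointer is read only once. A minimal sketch of the consumer side, with struct item and get_first_item as hypothetical names:

#include <linux/rculist.h>
#include <linux/refcount.h>

struct item {
	struct list_head list;
	refcount_t refs;
};

static struct item *get_first_item(struct list_head *items)
{
	struct item *it;

	rcu_read_lock();
	it = list_first_or_null_rcu(items, struct item, list);
	if (it)
		refcount_inc(&it->refs);	/* pin it before leaving the RCU section */
	rcu_read_unlock();

	return it;
}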
@@ -3032,8 +3031,8 @@ static struct pool *pool_create(struct mapped_device *pool_md,
}
pool->cell_sort_array =
- vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
- sizeof(*pool->cell_sort_array)));
+ vmalloc_array(CELL_SORT_ARRAY_SIZE,
+ sizeof(*pool->cell_sort_array));
if (!pool->cell_sort_array) {
*error = "Error allocating cell sort array";
err_p = ERR_PTR(-ENOMEM);
@@ -4112,8 +4111,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
- DM_TARGET_IMMUTABLE,
- .version = {1, 23, 0},
+ DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_CRYPTO,
+ .version = {1, 24, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4498,7 +4497,8 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 23, 0},
+ .features = DM_TARGET_PASSES_CRYPTO,
+ .version = {1, 24, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c
index 89cb7942ec5c..baf683cabb1b 100644
--- a/drivers/md/dm-vdo/block-map.c
+++ b/drivers/md/dm-vdo/block-map.c
@@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache,
* select_lru_page() - Determine which page is least recently used.
*
* Picks the least recently used from among the non-busy entries at the front of each of the lru
- * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely
+ * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely
* that the entries at the front are busy unless the queue is very short, but not impossible.
*
* Return: A pointer to the info structure for a relevant page, or NULL if no such page can be
@@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context)
static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio)
{
- return_vio_to_pool(zone->vio_pool, vio);
+ return_vio_to_pool(vio);
check_for_drain_complete(zone);
}
@@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion)
if (!vdo_copy_valid_page(vio->data, nonce, pbn, page))
vdo_format_block_map_page(page, nonce, pbn, false);
- return_vio_to_pool(zone->vio_pool, pooled);
+ return_vio_to_pool(pooled);
/* Release our claim to the load and wake any waiters */
release_page_lock(data_vio, "load");
@@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
struct data_vio *data_vio = completion->parent;
- struct block_map_zone *zone = pooled->context;
vio_record_metadata_io_error(vio);
- return_vio_to_pool(zone->vio_pool, pooled);
+ return_vio_to_pool(pooled);
abort_load(data_vio, result);
}
@@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor)
struct cursors *cursors = cursor->parent;
struct vdo_completion *completion = cursors->completion;
- return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio));
+ return_vio_to_pool(vdo_forget(cursor->vio));
if (--cursors->active_roots > 0)
return;
@@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map,
if (result != VDO_SUCCESS)
return result;
- result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE,
+ result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1,
zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR,
VIO_PRIORITY_METADATA, zone, &zone->vio_pool);
if (result != VDO_SUCCESS)
diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h
index a8c4d6e24b38..2a8b03779f87 100644
--- a/drivers/md/dm-vdo/constants.h
+++ b/drivers/md/dm-vdo/constants.h
@@ -44,9 +44,6 @@ enum {
/* The default size of each slab journal, in blocks */
DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224,
- /* Unit test minimum */
- MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2,
-
/*
* The initial size of lbn_operations and pbn_operations, which is based upon the expected
* maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c
index 810002747091..262e11581f2d 100644
--- a/drivers/md/dm-vdo/data-vio.c
+++ b/drivers/md/dm-vdo/data-vio.c
@@ -17,6 +17,7 @@
#include <linux/minmax.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
+#include <linux/string.h>
#include <linux/wait.h>
#include "logger.h"
@@ -509,18 +510,6 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb
vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY);
}
-static bool is_zero_block(char *block)
-{
- int i;
-
- for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) {
- if (*((u64 *) &block[i]))
- return false;
- }
-
- return true;
-}
-
static void copy_from_bio(struct bio *bio, char *data_ptr)
{
struct bio_vec biovec;
@@ -572,7 +561,7 @@ static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *b
* we acknowledge the bio.
*/
copy_from_bio(bio, data_vio->vio.data);
- data_vio->is_zero = is_zero_block(data_vio->vio.data);
+ data_vio->is_zero = mem_is_zero(data_vio->vio.data, VDO_BLOCK_SIZE);
data_vio->write = true;
}
@@ -1459,7 +1448,7 @@ static void modify_for_partial_write(struct vdo_completion *completion)
copy_from_bio(bio, data + data_vio->offset);
}
- data_vio->is_zero = is_zero_block(data);
+ data_vio->is_zero = mem_is_zero(data, VDO_BLOCK_SIZE);
data_vio->read = false;
launch_data_vio_logical_callback(data_vio,
continue_data_vio_with_block_map_slot);
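Both call sites above drop the local is_zero_block() helper in favour of mem_is_zero() from <linux/string.h>, a thin wrapper around memchr_inv() that reports whether a buffer contains only zero bytes. Usage is a single call, e.g.:

#include <linux/string.h>
#include <linux/types.h>

/* True if every byte of a (hypothetical) 4 KiB block buffer is zero. */
static bool block_is_all_zero(const char *data)
{
	return mem_is_zero(data, 4096);
}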
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
index b6f8e2dc7729..4d983092a152 100644
--- a/drivers/md/dm-vdo/dedupe.c
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -226,7 +226,7 @@ struct hash_lock {
* A list containing the data VIOs sharing this lock, all having the same record name and
* data block contents, linked by their hash_lock_node fields.
*/
- struct list_head duplicate_ring;
+ struct list_head duplicate_vios;
/* The number of data_vios sharing this lock instance */
data_vio_count_t reference_count;
@@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l
{
memset(lock, 0, sizeof(*lock));
INIT_LIST_HEAD(&lock->pool_node);
- INIT_LIST_HEAD(&lock->duplicate_ring);
+ INIT_LIST_HEAD(&lock->duplicate_vios);
vdo_waitq_init(&lock->waiters);
list_add_tail(&lock->pool_node, &zone->lock_pool);
}
@@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
"must have a hash zone when holding a hash lock");
VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
- "must be on a hash lock ring when holding a hash lock");
+ "must be on a hash lock list when holding a hash lock");
VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0,
"hash lock reference must be counted");
@@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
if (new_lock != NULL) {
/*
- * Keep all data_vios sharing the lock on a ring since they can complete in any
+ * Keep all data_vios sharing the lock on a list since they can complete in any
* order and we'll always need a pointer to one to compare data.
*/
- list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring);
+ list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios);
new_lock->reference_count += 1;
if (new_lock->max_references < new_lock->reference_count)
new_lock->max_references = new_lock->reference_count;
@@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate
struct hash_zone *zone;
bool collides;
- if (list_empty(&lock->duplicate_ring))
+ if (list_empty(&lock->duplicate_vios))
return false;
- lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio,
+ lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio,
hash_lock_entry);
zone = candidate->hash_zone;
collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data);
@@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio
return result;
result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry),
- "must not already be a member of a hash lock ring");
+ "must not already be a member of a hash lock list");
if (result != VDO_SUCCESS)
return result;
@@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio)
"returned hash lock must not be in use with state %s",
get_hash_lock_state_name(lock->state));
VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
- "hash lock returned to zone must not be in a pool ring");
- VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
+ "hash lock returned to zone must not be in a pool list");
+ VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios),
"hash lock returned to zone must not reference DataVIOs");
return_hash_lock_to_pool(zone, lock);
@@ -2178,6 +2178,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones)
vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval);
vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval);
+ spin_lock_init(&zones->lock);
/*
* Since we will save up the timeouts that would have been reported but were ratelimited,
@@ -2260,7 +2261,7 @@ static void check_for_drain_complete(struct hash_zone *zone)
if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) ||
change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING,
DEDUPE_QUERY_TIMER_IDLE)) {
- del_timer_sync(&zone->timer);
+ timer_delete_sync(&zone->timer);
} else {
/*
* There is an in flight time-out, which must get processed before we can continue.
@@ -2336,7 +2337,7 @@ static void timeout_index_operations_callback(struct vdo_completion *completion)
static void timeout_index_operations(struct timer_list *t)
{
- struct hash_zone *zone = from_timer(zone, t, timer);
+ struct hash_zone *zone = timer_container_of(zone, t, timer);
if (change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING,
DEDUPE_QUERY_TIMER_FIRED))
diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c
index 100e92f8f866..b7cc0f41caca 100644
--- a/drivers/md/dm-vdo/encodings.c
+++ b/drivers/md/dm-vdo/encodings.c
@@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block
ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks);
meta_blocks = (ref_blocks + slab_journal_blocks);
- /* Make sure test code hasn't configured slabs to be too small. */
+ /* Make sure configured slabs are not too small. */
if (meta_blocks >= slab_size)
return VDO_BAD_CONFIGURATION;
- /*
- * If the slab size is very small, assume this must be a unit test and override the number
- * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their
- * data_blocks fields to be the exact capacity of the configured volume, and that used to
- * fall out since they use a power of two for the number of data blocks, the slab size was
- * a power of two, and every block in a slab was a data block.
- *
- * TODO: Try to figure out some way of structuring testParameters and unit tests so this
- * hack isn't needed without having to edit several unit tests every time the metadata size
- * changes by one block.
- */
data_blocks = slab_size - meta_blocks;
- if ((slab_size < 1024) && !is_power_of_2(data_blocks))
- data_blocks = ((block_count_t) 1 << ilog2(data_blocks));
/*
* Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in
@@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config,
if (result != VDO_SUCCESS)
return result;
- result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
- "slab journal size meets minimum size");
- if (result != VDO_SUCCESS)
- return result;
-
result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size,
"slab journal size is within expected bound");
if (result != VDO_SUCCESS)
diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c
index ae11941c90a9..0613c82bbe8e 100644
--- a/drivers/md/dm-vdo/funnel-workqueue.c
+++ b/drivers/md/dm-vdo/funnel-workqueue.c
@@ -252,8 +252,7 @@ static void service_work_queue(struct simple_work_queue *queue)
* This speeds up some performance tests; that "other work" might include other VDO
* threads.
*/
- if (need_resched())
- cond_resched();
+ cond_resched();
}
run_finish_hook(queue);
diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c
index af8fab83b0f3..61edf2b72427 100644
--- a/drivers/md/dm-vdo/indexer/index-layout.c
+++ b/drivers/md/dm-vdo/indexer/index-layout.c
@@ -54,7 +54,6 @@
* Each save also has a unique nonce.
*/
-#define MAGIC_SIZE 32
#define NONCE_INFO_SIZE 32
#define MAX_SAVES 2
@@ -98,9 +97,11 @@ enum region_type {
#define SUPER_VERSION_CURRENT 3
#define SUPER_VERSION_MAXIMUM 7
-static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
+static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
+#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1)
+
struct region_header {
u64 magic;
u64 region_blocks;
diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c
index aee0914d604a..aa575a24e0b2 100644
--- a/drivers/md/dm-vdo/indexer/index-session.c
+++ b/drivers/md/dm-vdo/indexer/index-session.c
@@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session)
int uds_launch_request(struct uds_request *request)
{
- size_t internal_size;
int result;
if (request->callback == NULL) {
@@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request)
}
/* Reset all internal fields before processing. */
- internal_size =
- sizeof(struct uds_request) - offsetof(struct uds_request, zone_number);
- // FIXME should be using struct_group for this instead
- memset((char *) request + sizeof(*request) - internal_size, 0, internal_size);
+ memset(&request->internal, 0, sizeof(request->internal));
result = get_index_session(request->session);
if (result != UDS_SUCCESS)
diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h
index 183a94eb7e92..7c1fc4577f5b 100644
--- a/drivers/md/dm-vdo/indexer/indexer.h
+++ b/drivers/md/dm-vdo/indexer/indexer.h
@@ -8,6 +8,7 @@
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/wait.h>
@@ -73,7 +74,7 @@ enum uds_request_type {
/* Remove any mapping for a name. */
UDS_DELETE,
-};
+} __packed;
enum uds_open_index_type {
/* Create a new index. */
@@ -226,7 +227,7 @@ struct uds_zone_message {
enum uds_zone_message_type type;
/* The virtual chapter number to which the message applies */
u64 virtual_chapter;
-};
+} __packed;
struct uds_index_session;
struct uds_index;
@@ -253,34 +254,32 @@ struct uds_request {
/* The existing data associated with the request name, if any */
struct uds_record_data old_metadata;
- /* Either UDS_SUCCESS or an error code for the request */
- int status;
/* True if the record name had an existing entry in the index */
bool found;
+ /* Either UDS_SUCCESS or an error code for the request */
+ int status;
- /*
- * The remaining fields are used internally and should not be altered by clients. The index
- * relies on zone_number being the first field in this section.
- */
-
- /* The number of the zone which will process this request*/
- unsigned int zone_number;
- /* A link for adding a request to a lock-free queue */
- struct funnel_queue_entry queue_link;
- /* A link for adding a request to a standard linked list */
- struct uds_request *next_request;
- /* A pointer to the index processing this request */
- struct uds_index *index;
- /* Control message for coordinating between zones */
- struct uds_zone_message zone_message;
- /* If true, process request immediately by waking the worker thread */
- bool unbatched;
- /* If true, continue this request before processing newer requests */
- bool requeued;
- /* The virtual chapter containing the record name, if known */
- u64 virtual_chapter;
- /* The region of the index containing the record name */
- enum uds_index_region location;
+ /* The remaining fields are used internally and should not be altered by clients. */
+ struct_group(internal,
+ /* The virtual chapter containing the record name, if known */
+ u64 virtual_chapter;
+ /* The region of the index containing the record name */
+ enum uds_index_region location;
+ /* If true, process request immediately by waking the worker thread */
+ bool unbatched;
+ /* If true, continue this request before processing newer requests */
+ bool requeued;
+ /* Control message for coordinating between zones */
+ struct uds_zone_message zone_message;
+ /* The number of the zone which will process this request */
+ /* The number of the zone which will process this request */
+ unsigned int zone_number;
+ /* A link for adding a request to a lock-free queue */
+ struct funnel_queue_entry queue_link;
+ /* A link for adding a request to a standard linked list */
+ struct uds_request *next_request;
+ /* A pointer to the index processing this request */
+ struct uds_index *index;
+ );
};
/* A session is required for most index operations. */
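struct_group() from <linux/stddef.h> wraps the listed members in an anonymous struct plus a same-sized named mirror, so each member keeps its original name while the group as a whole can be addressed as request->internal — which is what lets uds_launch_request() reset the internal state with one bounded memset() instead of the old offsetof() arithmetic. A small sketch of the same idiom with hypothetical names:

#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

struct conn {
	int fd;				/* caller-owned; must survive a reset */
	struct_group(scratch,		/* internal bookkeeping, safe to wipe */
		u32 seq;
		bool busy;
	);
};

static void reset_conn(struct conn *c)
{
	/* Clears seq and busy, leaves fd untouched. */
	memset(&c->scratch, 0, sizeof(c->scratch));
}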
diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c
index 12f954a0c532..afb062e1f1fb 100644
--- a/drivers/md/dm-vdo/indexer/volume-index.c
+++ b/drivers/md/dm-vdo/indexer/volume-index.c
@@ -836,7 +836,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
"%zu bytes decoded of %zu expected", offset,
sizeof(buffer));
if (result != VDO_SUCCESS)
- result = UDS_CORRUPT_DATA;
+ return UDS_CORRUPT_DATA;
if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
@@ -928,7 +928,7 @@ static int start_restoring_volume_index(struct volume_index *volume_index,
"%zu bytes decoded of %zu expected", offset,
sizeof(buffer));
if (result != VDO_SUCCESS)
- result = UDS_CORRUPT_DATA;
+ return UDS_CORRUPT_DATA;
if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c
index 655453bb276b..425b3a74f4db 100644
--- a/drivers/md/dm-vdo/indexer/volume.c
+++ b/drivers/md/dm-vdo/indexer/volume.c
@@ -754,10 +754,11 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
u32 physical_page, struct cached_page **page_ptr)
{
struct cached_page *page;
+ unsigned int zone_number = request->zone_number;
get_page_from_cache(&volume->page_cache, physical_page, &page);
if (page != NULL) {
- if (request->zone_number == 0) {
+ if (zone_number == 0) {
/* Only one zone is allowed to update the LRU. */
make_page_most_recent(&volume->page_cache, page);
}
@@ -767,7 +768,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
}
/* Prepare to enqueue a read for the page. */
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
mutex_lock(&volume->read_threads_mutex);
/*
@@ -787,8 +788,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
* the order does not matter for correctness as it does below.
*/
mutex_unlock(&volume->read_threads_mutex);
- begin_pending_search(&volume->page_cache, physical_page,
- request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
return UDS_QUEUED;
}
@@ -797,7 +797,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
* "search pending" state in careful order so no other thread can mess with the data before
* the caller gets to look at it.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
mutex_unlock(&volume->read_threads_mutex);
*page_ptr = page;
return UDS_SUCCESS;
@@ -849,6 +849,7 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r
{
int result;
struct cached_page *page = NULL;
+ unsigned int zone_number = request->zone_number;
u32 physical_page = map_to_physical_page(volume->geometry, chapter,
index_page_number);
@@ -858,18 +859,18 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r
* invalidation by the reader thread, before the reader thread has noticed that the
* invalidate_counter has been incremented.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
result = get_volume_page_protected(volume, request, physical_page, &page);
if (result != UDS_SUCCESS) {
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
result = uds_search_chapter_index_page(&page->index_page, volume->geometry,
&request->record_name,
record_page_number);
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
@@ -882,6 +883,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
{
struct cached_page *record_page;
struct index_geometry *geometry = volume->geometry;
+ unsigned int zone_number = request->zone_number;
int result;
u32 physical_page, page_number;
@@ -905,11 +907,11 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
* invalidation by the reader thread, before the reader thread has noticed that the
* invalidate_counter has been incremented.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
result = get_volume_page_protected(volume, request, physical_page, &record_page);
if (result != UDS_SUCCESS) {
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
@@ -917,7 +919,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
&request->record_name, geometry, &request->old_metadata))
*found = true;
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return UDS_SUCCESS;
}
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
index 421e5436c32c..11d47770b54d 100644
--- a/drivers/md/dm-vdo/io-submitter.c
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
* @error_handler: the handler for submission or I/O errors (may be NULL)
* @operation: the type of I/O to perform
* @data: the buffer to read or write (may be NULL)
+ * @size: the size of the I/O in bytes
*
* The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block
* other vdo threads.
@@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
*/
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
- blk_opf_t operation, char *data)
+ blk_opf_t operation, char *data, int size)
{
int result;
struct vdo_completion *completion = &vio->completion;
@@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
vdo_reset_completion(completion);
completion->error_handler = error_handler;
- result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical);
+ result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META,
+ physical);
if (result != VDO_SUCCESS) {
continue_vio(vio, result);
return;
diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h
index 80748699496f..3088f11055fd 100644
--- a/drivers/md/dm-vdo/io-submitter.h
+++ b/drivers/md/dm-vdo/io-submitter.h
@@ -8,6 +8,7 @@
#include <linux/bio.h>
+#include "constants.h"
#include "types.h"
struct io_submitter;
@@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio);
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
- blk_opf_t operation, char *data);
+ blk_opf_t operation, char *data, int size);
static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
blk_opf_t operation)
{
__submit_metadata_vio(vio, physical, callback, error_handler,
- operation, vio->data);
+ operation, vio->data, vio->block_count * VDO_BLOCK_SIZE);
+}
+
+static inline void vdo_submit_metadata_vio_with_size(struct vio *vio,
+ physical_block_number_t physical,
+ bio_end_io_t callback,
+ vdo_action_fn error_handler,
+ blk_opf_t operation,
+ int size)
+{
+ __submit_metadata_vio(vio, physical, callback, error_handler,
+ operation, vio->data, size);
}
static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
@@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
{
/* FIXME: Can we just use REQ_OP_FLUSH? */
__submit_metadata_vio(vio, 0, callback, error_handler,
- REQ_OP_WRITE | REQ_PREFLUSH, NULL);
+ REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0);
}
#endif /* VDO_IO_SUBMITTER_H */
diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c
index 3f7dc2cb6b98..76a987ccf926 100644
--- a/drivers/md/dm-vdo/logger.c
+++ b/drivers/md/dm-vdo/logger.c
@@ -34,7 +34,7 @@ static const char *get_current_interrupt_type(void)
if (in_nmi())
return "NMI";
- if (in_irq())
+ if (in_hardirq())
return "HI";
if (in_softirq())
diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h
index 0f3be44710b5..8c8d6892582d 100644
--- a/drivers/md/dm-vdo/packer.h
+++ b/drivers/md/dm-vdo/packer.h
@@ -46,7 +46,7 @@ struct compressed_block {
/*
* Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed
- * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with
+ * block. The bins are kept in a list sorted by the amount of unused space so the first bin with
* enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or
* is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin.
* Upon entering the packer, each data_vio already has its compressed data in the first slot of the
diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c
index 42d3d8d0e4b5..9bae8256ba4e 100644
--- a/drivers/md/dm-vdo/priority-table.c
+++ b/drivers/md/dm-vdo/priority-table.c
@@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e
/*
* Remove the entry from the bucket list, remembering a pointer to another entry in the
- * ring.
+ * list.
*/
next_entry = entry->next;
list_del_init(entry);
diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h
index 899071173015..25e7ec6d19f6 100644
--- a/drivers/md/dm-vdo/recovery-journal.h
+++ b/drivers/md/dm-vdo/recovery-journal.h
@@ -43,9 +43,9 @@
* has a vio which is used to commit that block to disk. The vio's data is the on-disk
* representation of the journal block. In addition each in-memory block has a buffer which is used
* to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
- * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active
- * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is
- * moved back to the 'free_tail_blocks' ring.
+ * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active
+ * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is
+ * moved back to the 'free_tail_blocks' list.
*
* When entries are added to the journal, they are added to the active in-memory block, as
* indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
index 8f0a35c63af6..f3d80ff7bef5 100644
--- a/drivers/md/dm-vdo/slab-depot.c
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab)
}
/**
- * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
+ * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct
* order.
* @journal: The journal to be marked dirty.
* @lock: The recovery journal lock held by the slab journal.
@@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion)
{
struct slab_journal *journal = completion->parent;
- return_vio_to_pool(journal->slab->allocator->vio_pool,
- vio_as_pooled_vio(as_vio(vdo_forget(completion))));
+ return_vio_to_pool(vio_as_pooled_vio(as_vio(completion)));
finish_reaping(journal);
reap_slab_journal(journal);
}
@@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion)
sequence_number_t committed = get_committing_sequence_number(pooled);
list_del_init(&pooled->list_entry);
- return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled));
+ return_vio_to_pool(pooled);
if (result != VDO_SUCCESS) {
vio_record_metadata_io_error(as_vio(completion));
@@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal)
/*
* Since we are about to commit the tail block, this journal no longer needs to be on the
- * ring of journals which the recovery journal might ask to commit.
+ * list of journals which the recovery journal might ask to commit.
*/
mark_slab_journal_clean(journal);
@@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion)
/* Release the slab journal lock. */
adjust_slab_journal_block_reference(&slab->journal,
block->slab_journal_lock_to_release, -1);
- return_vio_to_pool(slab->allocator->vio_pool, pooled);
+ return_vio_to_pool(pooled);
/*
* We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
@@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion)
struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;
vio_record_metadata_io_error(vio);
- return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
- slab->active_count--;
+ return_vio_to_pool(vio_as_pooled_vio(vio));
+ slab->active_count -= vio->io_size / VDO_BLOCK_SIZE;
vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
check_if_slab_drained(slab);
}
@@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab)
static void prioritize_slab(struct vdo_slab *slab)
{
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
- "a slab must not already be on a ring when prioritizing");
+ "a slab must not already be on a list when prioritizing");
slab->priority = calculate_slab_priority(slab);
vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
slab->priority, &slab->allocq_entry);
@@ -2165,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab)
dirty_block(&slab->reference_blocks[i]);
}
+static inline bool journal_points_equal(struct journal_point first,
+ struct journal_point second)
+{
+ return ((first.sequence_number == second.sequence_number) &&
+ (first.entry_count == second.entry_count));
+}
+
/**
- * clear_provisional_references() - Clear the provisional reference counts from a reference block.
- * @block: The block to clear.
+ * match_bytes() - Check an 8-byte word for bytes matching the value specified
+ * @input: A word to examine the bytes of
+ * @match: The byte value sought
+ *
+ * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise
*/
-static void clear_provisional_references(struct reference_block *block)
+static inline u64 match_bytes(u64 input, u8 match)
{
- vdo_refcount_t *counters = get_reference_counters_for_block(block);
- block_count_t j;
+ u64 temp = input ^ (match * 0x0101010101010101ULL);
+ /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */
+ u64 test_top_bits = ~temp & 0x8080808080808080ULL;
+ /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */
+ u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL);
+ /* return 1 when both tests indicate temp byte is 0 */
+ return (test_top_bits & test_low_bits) >> 7;
+}
+
+/**
+ * count_valid_references() - Process a newly loaded refcount array
+ * @counters: the array of counters from a metadata block
+ *
+ * Scan an 8-byte-aligned array of counters, fixing up any "provisional" values that weren't
+ * cleaned up at shutdown, changing them internally to "empty".
+ *
+ * Return: the number of blocks that are referenced (counters not "empty")
+ */
+static unsigned int count_valid_references(vdo_refcount_t *counters)
+{
+ u64 *words = (u64 *)counters;
+ /* It's easier to count occurrences of a specific byte than its absences. */
+ unsigned int empty_count = 0;
+ /* For speed, we process 8 bytes at once. */
+ unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64);
+
+ /*
+ * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter
+ * array is a multiple of the word size.
+ */
+ BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1);
+ BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0);
+
+ while (words_left > 0) {
+ /*
+ * This is effectively eight byte-sized counters: byte 0 counts how many words
+ * had the target value in byte 0, and so on. We just have to avoid overflow.
+ */
+ u64 split_count = 0;
+ /*
+ * The counter "% 255" trick used below to fold split_count into empty_count
+ * imposes a limit of 254 bytes examined each iteration of the outer loop. We
+ * process a word at a time, so that limit gets rounded down to 31 u64 words.
+ */
+ const unsigned int max_words_per_iteration = 254 / sizeof(u64);
+ unsigned int iter_words_left = min_t(unsigned int, words_left,
+ max_words_per_iteration);
+
+ words_left -= iter_words_left;
+
+ while (iter_words_left--) {
+ u64 word = *words;
+ u64 temp;
+
+ /* First, if we have any provisional refcount values, clear them. */
+ temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT);
+ if (temp) {
+ /*
+ * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor
+ * will alter just those bytes, changing PROVISIONAL to EMPTY.
+ */
+ word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT);
+ *words = word;
+ }
- for (j = 0; j < COUNTS_PER_BLOCK; j++) {
- if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
- counters[j] = EMPTY_REFERENCE_COUNT;
- block->allocated_count--;
+ /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */
+ split_count += match_bytes(word, EMPTY_REFERENCE_COUNT);
+ words++;
}
+ empty_count += split_count % 255;
}
-}
-static inline bool journal_points_equal(struct journal_point first,
- struct journal_point second)
-{
- return ((first.sequence_number == second.sequence_number) &&
- (first.entry_count == second.entry_count));
+ return COUNTS_PER_BLOCK - empty_count;
}
/**
@@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first,
static void unpack_reference_block(struct packed_reference_block *packed,
struct reference_block *block)
{
- block_count_t index;
sector_count_t i;
struct vdo_slab *slab = block->slab;
vdo_refcount_t *counters = get_reference_counters_for_block(block);
@@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed,
}
}
- block->allocated_count = 0;
- for (index = 0; index < COUNTS_PER_BLOCK; index++) {
- if (counters[index] != EMPTY_REFERENCE_COUNT)
- block->allocated_count++;
- }
+ block->allocated_count = count_valid_references(counters);
}
/**
@@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion)
struct pooled_vio *pooled = vio_as_pooled_vio(vio);
struct reference_block *block = completion->parent;
struct vdo_slab *slab = block->slab;
+ unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE;
+ unsigned int i;
+ char *data = vio->data;
- unpack_reference_block((struct packed_reference_block *) vio->data, block);
- return_vio_to_pool(slab->allocator->vio_pool, pooled);
- slab->active_count--;
- clear_provisional_references(block);
+ for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) {
+ struct packed_reference_block *packed = (struct packed_reference_block *) data;
+
+ unpack_reference_block(packed, block);
+ slab->free_blocks -= block->allocated_count;
+ }
+ return_vio_to_pool(pooled);
+ slab->active_count -= block_count;
- slab->free_blocks -= block->allocated_count;
check_if_slab_drained(slab);
}
@@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio)
}
/**
- * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
- * block.
- * @waiter: The waiter of the block to load.
+ * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load
+ * a set of blocks.
+ * @waiter: The waiter of the first block to load.
* @context: The VIO returned by the pool.
*/
-static void load_reference_block(struct vdo_waiter *waiter, void *context)
+static void load_reference_block_group(struct vdo_waiter *waiter, void *context)
{
struct pooled_vio *pooled = context;
struct vio *vio = &pooled->vio;
struct reference_block *block =
container_of(waiter, struct reference_block, waiter);
- size_t block_offset = (block - block->slab->reference_blocks);
+ u32 block_offset = block - block->slab->reference_blocks;
+ u32 max_block_count = block->slab->reference_block_count - block_offset;
+ u32 block_count = min_t(int, vio->block_count, max_block_count);
vio->completion.parent = block;
- vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset,
- load_reference_block_endio, handle_io_error,
- REQ_OP_READ);
+ vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset,
+ load_reference_block_endio, handle_io_error,
+ REQ_OP_READ, block_count * VDO_BLOCK_SIZE);
}
/**
@@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context)
static void load_reference_blocks(struct vdo_slab *slab)
{
block_count_t i;
+ u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio;
+ struct vio_pool *pool = slab->allocator->refcount_big_vio_pool;
+
+ if (!pool) {
+ pool = slab->allocator->vio_pool;
+ blocks_per_vio = 1;
+ }
slab->free_blocks = slab->block_count;
slab->active_count = slab->reference_block_count;
- for (i = 0; i < slab->reference_block_count; i++) {
+ for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) {
struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter;
- waiter->callback = load_reference_block;
- acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+ waiter->callback = load_reference_block_group;
+ acquire_vio_from_pool(pool, waiter);
}
}
@@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion)
initialize_journal_state(journal);
}
- return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+ return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
}
@@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
vio_record_metadata_io_error(vio);
- return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+ return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&journal->slab->state, result);
}
@@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab)
int result;
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
- "a requeued slab must not already be on a ring");
+ "a requeued slab must not already be on a list");
if (vdo_is_read_only(allocator->depot->vdo))
return;
@@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
vdo_log_info("VDO commencing normal operation");
else if (prior_state == VDO_RECOVERING)
vdo_log_info("Exiting recovery mode");
+ free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
}
/*
@@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator,
* This is a min_heap callback function that orders slab_status structures using the 'is_clean' field as
* the primary key and the 'emptiness' field as the secondary key.
*
- * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
+ * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping
* should always get the most empty first, so pushing should be from most empty to least empty.
* Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
* before larger ones.
@@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
struct vdo *vdo = depot->vdo;
block_count_t max_free_blocks = depot->slab_config.data_blocks;
unsigned int max_priority = (2 + ilog2(max_free_blocks));
+ u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio;
*allocator = (struct block_allocator) {
.depot = depot,
@@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
return result;
vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
- result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id,
+ result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id,
VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
allocator, &allocator->vio_pool);
if (result != VDO_SUCCESS)
return result;
+ /* Initialize the refcount-reading vio pool. */
+ reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks);
+ refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO);
+ refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed);
+ allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio;
+ result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE,
+ allocator->refcount_blocks_per_big_vio, allocator->thread_id,
+ VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
+ NULL, &allocator->refcount_big_vio_pool);
+ if (result != VDO_SUCCESS)
+ return result;
+
result = initialize_slab_scrubber(allocator);
if (result != VDO_SUCCESS)
return result;
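
As a quick illustration of the pool sizing just computed: the first DIV_ROUND_UP decides how many reads are needed given the per-vio ceiling, and the second spreads the blocks evenly across those reads instead of issuing maximum-sized vios plus one small remainder. A minimal standalone sketch with made-up numbers (the real ceiling is MAX_BLOCKS_PER_VIO from the VDO headers):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Illustrative figures only; not VDO's actual constants. */
	unsigned int reference_block_count = 2000;
	unsigned int max_blocks_per_vio = 312;
	unsigned int reads_needed = DIV_ROUND_UP(reference_block_count,
						 max_blocks_per_vio);	/* 7 */
	unsigned int blocks_per_vio = DIV_ROUND_UP(reference_block_count,
						   reads_needed);	/* 286 */

	/* Seven reads of at most 286 blocks each, rather than six of 312 plus one of 128. */
	printf("%u reads of up to %u blocks\n", reads_needed, blocks_per_vio);
	return 0;
}
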
@@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot)
uninitialize_allocator_summary(allocator);
uninitialize_scrubber_vio(&allocator->scrubber);
free_vio_pool(vdo_forget(allocator->vio_pool));
+ free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
}
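
The match_bytes()/count_valid_references() pair added above is a SWAR (SIMD-within-a-register) scan: each 64-bit word is tested for bytes equal to the target value, producing a 0x01 marker in every matching byte lane, and the per-lane tallies are summed with a modulo-255 fold. A minimal userspace sketch of the same arithmetic, with made-up counter data and 0 standing in for EMPTY_REFERENCE_COUNT:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t match_bytes(uint64_t input, uint8_t match)
{
	uint64_t temp = input ^ (match * 0x0101010101010101ULL);
	uint64_t top = ~temp & 0x8080808080808080ULL;
	uint64_t low = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL);

	/* 0x01 in every byte lane where 'input' held 'match', 0 elsewhere. */
	return (top & low) >> 7;
}

int main(void)
{
	/* 16 one-byte counters; ten of them are 0 ("empty"). */
	uint8_t counters[16] = { 0, 1, 0, 2, 0, 0, 3, 0, 4, 0, 0, 0, 5, 0, 6, 0 };
	uint64_t words[2];
	uint64_t split_count = 0;
	int i;

	memcpy(words, counters, sizeof(words));
	for (i = 0; i < 2; i++)
		split_count += match_bytes(words[i], 0);

	/*
	 * Each byte lane of split_count holds that lane's match count;
	 * x % 255 sums base-256 digits, so this folds the lanes together.
	 */
	printf("empty counters: %u\n", (unsigned int)(split_count % 255)); /* 10 */
	return 0;
}

The fold is exact only while the per-pass total stays under 255 and no byte lane overflows, which is why the kernel loop above caps each pass at 254 bytes, i.e. 31 u64 words.
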
diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h
index f234853501ca..fadc0c9d4dc4 100644
--- a/drivers/md/dm-vdo/slab-depot.h
+++ b/drivers/md/dm-vdo/slab-depot.h
@@ -45,6 +45,13 @@
enum {
/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
+
+ /*
+ * The number of vios in the vio pool used for loading reference count data. A slab's
+ * refcounts are capped at ~8MB, and we process one at a time in a zone, so 9 should be
+ * plenty.
+ */
+ BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9,
};
/*
@@ -248,7 +255,7 @@ struct vdo_slab {
/* A list of the dirty blocks waiting to be written out */
struct vdo_wait_queue dirty_blocks;
- /* The number of blocks which are currently writing */
+ /* The number of blocks which are currently reading or writing */
size_t active_count;
/* A waiter object for updating the slab summary */
@@ -425,6 +432,10 @@ struct block_allocator {
/* The vio pool for reading and writing block allocator metadata */
struct vio_pool *vio_pool;
+ /* The vio pool for large initial reads of ref count areas */
+ struct vio_pool *refcount_big_vio_pool;
+ /* How many ref count blocks are read per vio at initial load */
+ u32 refcount_blocks_per_big_vio;
/* The dm_kcopyd client for erasing slab journals */
struct dm_kcopyd_client *eraser;
/* Iterator over the slabs to be erased */
diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h
index dbe892b10f26..cdf36e7d7702 100644
--- a/drivers/md/dm-vdo/types.h
+++ b/drivers/md/dm-vdo/types.h
@@ -376,6 +376,9 @@ struct vio {
/* The size of this vio in blocks */
unsigned int block_count;
+ /* The amount of data to be read or written, in bytes */
+ unsigned int io_size;
+
/* The data being read or written. */
char *data;
diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c
index a7e32baab4af..80b608674022 100644
--- a/drivers/md/dm-vdo/vdo.c
+++ b/drivers/md/dm-vdo/vdo.c
@@ -31,9 +31,7 @@
#include <linux/completion.h>
#include <linux/device-mapper.h>
-#include <linux/kernel.h>
#include <linux/lz4.h>
-#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>
@@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr)
vdo_unregister_allocating_thread();
}
-#ifdef MODULE
-#define MODULE_NAME THIS_MODULE->name
-#else
-#define MODULE_NAME "dm-vdo"
-#endif /* MODULE */
-
static const struct vdo_work_queue_type default_queue_type = {
.start = start_vdo_request_queue,
.finish = finish_vdo_request_queue,
@@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason,
*vdo_ptr = vdo;
snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix),
- "%s%u", MODULE_NAME, instance);
- BUG_ON(vdo->thread_name_prefix[0] == '\0');
+ "vdo%u", instance);
result = vdo_allocate(vdo->thread_config.thread_count,
struct vdo_thread, __func__, &vdo->threads);
if (result != VDO_SUCCESS) {
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
index e710f3c5a972..8fc22fb14196 100644
--- a/drivers/md/dm-vdo/vio.c
+++ b/drivers/md/dm-vdo/vio.c
@@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
/*
* Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated
- * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a
- * vio associated with the bio.
+ * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not
+ * have to be a vio associated with the bio.
*/
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn)
{
- int bvec_count, offset, len, i;
+ return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE,
+ callback, bi_opf, pbn);
+}
+
+int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn)
+{
+ int bvec_count, offset, i;
struct bio *bio = vio->bio;
+ int vio_size = vio->block_count * VDO_BLOCK_SIZE;
+ int remaining;
bio_reset(bio, bio->bi_bdev, bi_opf);
vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn);
@@ -203,24 +212,23 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
return VDO_SUCCESS;
bio->bi_ioprio = 0;
- bio->bi_io_vec = bio->bi_inline_vecs;
+ bio->bi_io_vec = bio_inline_vecs(bio);
bio->bi_max_vecs = vio->block_count + 1;
- len = VDO_BLOCK_SIZE * vio->block_count;
+ if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d",
+ size, vio_size) != VDO_SUCCESS)
+ size = vio_size;
+ vio->io_size = size;
offset = offset_in_page(data);
- bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+ remaining = size;
- /*
- * If we knew that data was always on one page, or contiguous pages, we wouldn't need the
- * loop. But if we're using vmalloc, it's not impossible that the data is in different
- * pages that can't be merged in bio_add_page...
- */
- for (i = 0; (i < bvec_count) && (len > 0); i++) {
+ for (i = 0; (i < bvec_count) && (remaining > 0); i++) {
struct page *page;
int bytes_added;
int bytes = PAGE_SIZE - offset;
- if (bytes > len)
- bytes = len;
+ if (bytes > remaining)
+ bytes = remaining;
page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data);
bytes_added = bio_add_page(bio, page, bytes, offset);
@@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
}
data += bytes;
- len -= bytes;
+ remaining -= bytes;
offset = 0;
}
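
The bvec_count arithmetic above rounds up across the page boundary because the buffer handed to vio_reset_bio_with_size() need not start page aligned; an unaligned start can cost one extra bio_vec, which is also why the bio is sized for block_count + 1 vectors. A tiny standalone illustration, assuming a 4096-byte PAGE_SIZE:

#include <stdio.h>

#define PAGE_SIZE 4096
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int size = 2 * PAGE_SIZE;	/* two 4k blocks of data */
	unsigned int offset = 512;		/* buffer starts mid-page */
	unsigned int bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);

	/* 512 + 8192 bytes span three pages, so three bio_vecs are needed. */
	printf("bvec_count = %u\n", bvec_count);	/* prints 3 */
	return 0;
}
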
@@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio)
* make_vio_pool() - Create a new vio pool.
* @vdo: The vdo.
* @pool_size: The number of vios in the pool.
+ * @block_count: The number of 4k blocks per vio.
* @thread_id: The ID of the thread using this pool.
* @vio_type: The type of vios in the pool.
* @priority: The priority with which vios from the pool should be enqueued.
@@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio)
*
* Return: A success or error code.
*/
-int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
+int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id,
enum vio_type vio_type, enum vio_priority priority, void *context,
struct vio_pool **pool_ptr)
{
struct vio_pool *pool;
char *ptr;
int result;
+ size_t per_vio_size = VDO_BLOCK_SIZE * block_count;
result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio,
__func__, &pool);
@@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
INIT_LIST_HEAD(&pool->available);
INIT_LIST_HEAD(&pool->busy);
- result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char,
+ result = vdo_allocate(pool_size * per_vio_size, char,
"VIO pool buffer", &pool->buffer);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
ptr = pool->buffer;
- for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) {
+ for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) {
struct pooled_vio *pooled = &pool->vios[pool->size];
- result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr,
+ result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr,
&pooled->vio);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
pooled->context = context;
+ pooled->pool = pool;
list_add_tail(&pooled->pool_entry, &pool->available);
}
@@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter)
}
/**
- * return_vio_to_pool() - Return a vio to the pool
- * @pool: The vio pool.
+ * return_vio_to_pool() - Return a vio to its pool
* @vio: The pooled vio to return.
*/
-void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio)
+void return_vio_to_pool(struct pooled_vio *vio)
{
+ struct vio_pool *pool = vio->pool;
+
VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
"vio pool entry returned on same thread as it was acquired");
diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h
index 3490e9f59b04..4bfcb21901f1 100644
--- a/drivers/md/dm-vdo/vio.h
+++ b/drivers/md/dm-vdo/vio.h
@@ -30,6 +30,8 @@ struct pooled_vio {
void *context;
/* The list entry used by the pool */
struct list_head pool_entry;
+ /* The pool this vio is allocated from */
+ struct vio_pool *pool;
};
/**
@@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn);
+int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn);
void update_vio_error_stats(struct vio *vio, const char *format, ...)
__printf(2, 3);
@@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio)
struct vio_pool;
-int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
- enum vio_type vio_type, enum vio_priority priority,
- void *context, struct vio_pool **pool_ptr);
+int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count,
+ thread_id_t thread_id, enum vio_type vio_type,
+ enum vio_priority priority, void *context,
+ struct vio_pool **pool_ptr);
void free_vio_pool(struct vio_pool *pool);
bool __must_check is_vio_pool_busy(struct vio_pool *pool);
void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter);
-void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio);
+void return_vio_to_pool(struct pooled_vio *vio);
#endif /* VIO_H */
diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c
index 6e1e739277ef..f81ed0cee2bf 100644
--- a/drivers/md/dm-vdo/wait-queue.c
+++ b/drivers/md/dm-vdo/wait-queue.c
@@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w
waitq->last_waiter->next_waiter = waiter;
}
- /* In both cases, the waiter we added to the ring becomes the last waiter. */
+ /* In both cases, the waiter we added to the list becomes the last waiter. */
waitq->last_waiter = waiter;
waitq->length += 1;
}
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 62b1a44b8dd2..72047b47a7a0 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -40,35 +40,23 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset)
}
/*
- * Decode an RS block using Reed-Solomon.
- */
-static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio,
- u8 *data, u8 *fec, int neras)
-{
- int i;
- uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
-
- for (i = 0; i < v->fec->roots; i++)
- par[i] = fec[i];
-
- return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras,
- fio->erasures, 0, NULL);
-}
-
-/*
* Read error-correcting codes for the requested RS block. Returns a pointer
* to the data block. Caller is responsible for releasing buf.
*/
static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
- unsigned int *offset, struct dm_buffer **buf,
- unsigned short ioprio)
+ unsigned int *offset, unsigned int par_buf_offset,
+ struct dm_buffer **buf, unsigned short ioprio)
{
u64 position, block, rem;
u8 *res;
+ /* We have already read part of the parity bytes; skip to the next block */
+ if (par_buf_offset)
+ index++;
+
position = (index + rsb) * v->fec->roots;
block = div64_u64_rem(position, v->fec->io_size, &rem);
- *offset = (unsigned int)rem;
+ *offset = par_buf_offset ? 0 : (unsigned int)rem;
res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio);
if (IS_ERR(res)) {
@@ -128,11 +116,13 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
{
int r, corrected = 0, res;
struct dm_buffer *buf;
- unsigned int n, i, offset;
+ unsigned int n, i, j, offset, par_buf_offset = 0;
+ uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
u8 *par, *block;
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
- par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+ par = fec_read_parity(v, rsb, block_offset, &offset,
+ par_buf_offset, &buf, bio->bi_ioprio);
if (IS_ERR(par))
return PTR_ERR(par);
@@ -142,7 +132,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
*/
fec_for_each_buffer_rs_block(fio, n, i) {
block = fec_buffer_rs_block(v, fio, n, i);
- res = fec_decode_rs8(v, fio, block, &par[offset], neras);
+ for (j = 0; j < v->fec->roots - par_buf_offset; j++)
+ par_buf[par_buf_offset + j] = par[offset + j];
+ /* Decode an RS block using Reed-Solomon */
+ res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn,
+ NULL, neras, fio->erasures, 0, NULL);
if (res < 0) {
r = res;
goto error;
@@ -155,12 +149,22 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
if (block_offset >= 1 << v->data_dev_block_bits)
goto done;
- /* read the next block when we run out of parity bytes */
- offset += v->fec->roots;
+ /* Read the next block when we run out of parity bytes */
+ offset += (v->fec->roots - par_buf_offset);
+ /* Check if parity bytes are split between blocks */
+ if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) {
+ par_buf_offset = v->fec->io_size - offset;
+ for (j = 0; j < par_buf_offset; j++)
+ par_buf[j] = par[offset + j];
+ offset += par_buf_offset;
+ } else
+ par_buf_offset = 0;
+
if (offset >= v->fec->io_size) {
dm_bufio_release(buf);
- par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+ par = fec_read_parity(v, rsb, block_offset, &offset,
+ par_buf_offset, &buf, bio->bi_ioprio);
if (IS_ERR(par))
return PTR_ERR(par);
}
@@ -187,7 +191,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
u8 *want_digest, u8 *data)
{
if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits,
- verity_io_real_digest(v, io), true)))
+ verity_io_real_digest(v, io))))
return 0;
return memcmp(verity_io_real_digest(v, io), want_digest,
@@ -250,7 +254,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
bufio = v->bufio;
}
- bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio_prio(bio));
+ bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio);
if (IS_ERR(bbuf)) {
DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld",
v->data_dev->name,
@@ -316,11 +320,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT);
- if (unlikely(!fio->bufs[n])) {
- DMERR("failed to allocate FEC buffer");
- return -ENOMEM;
- }
+ fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOIO);
}
/* try to allocate the maximum number of buffers */
@@ -388,7 +388,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
/* Always re-validate the corrected block against the expected hash */
r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits,
- verity_io_real_digest(v, io), true);
+ verity_io_real_digest(v, io));
if (unlikely(r < 0))
return r;
@@ -589,6 +589,10 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
(*argc)--;
if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) {
+ if (v->fec->dev) {
+ ti->error = "FEC device already specified";
+ return -EINVAL;
+ }
r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev);
if (r) {
ti->error = "FEC device lookup failed";
@@ -724,10 +728,7 @@ int verity_fec_ctr(struct dm_verity *v)
return -E2BIG;
}
- if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1))
- f->io_size = 1 << v->data_dev_block_bits;
- else
- f->io_size = v->fec->roots << SECTOR_SHIFT;
+ f->io_size = 1 << v->data_dev_block_bits;
f->bufio = dm_bufio_client_create(f->dev->bdev,
f->io_size,
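
The par_buf handling added above exists because the group of 'roots' parity bytes for an RS block can straddle the io_size boundary of the buffers dm-bufio returns, now that io_size is always a full data block. A standalone sketch of the placement arithmetic, using hypothetical geometry (4096-byte buffers, 10 roots):

#include <stdint.h>
#include <stdio.h>

struct parity_pos {
	uint64_t block;		/* bufio block holding the first parity byte */
	unsigned int offset;	/* byte offset of the parity within that block */
	unsigned int split;	/* parity bytes spilling into the next block */
};

static struct parity_pos locate_parity(uint64_t rsb, uint64_t index,
				       unsigned int roots, unsigned int io_size)
{
	uint64_t position = (index + rsb) * roots;
	struct parity_pos p;

	p.block = position / io_size;
	p.offset = position % io_size;
	p.split = (p.offset + roots > io_size) ? p.offset + roots - io_size : 0;
	return p;
}

int main(void)
{
	struct parity_pos p = locate_parity(0, 409, 10, 4096);

	/* Prints "block 0 offset 4090 split 4": 4 parity bytes carry over. */
	printf("block %llu offset %u split %u\n",
	       (unsigned long long)p.block, p.offset, p.split);
	return 0;
}

When split is non-zero, the tail of one buffer is copied into par_buf and the remaining bytes are taken from the start of the following block, which is what par_buf_offset tracks in fec_decode_bufs().
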
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 47d595f6a76e..66a00a8ccb39 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -19,7 +19,6 @@
#include "dm-audit.h"
#include <linux/module.h>
#include <linux/reboot.h>
-#include <linux/scatterlist.h>
#include <linux/string.h>
#include <linux/jump_label.h>
#include <linux/security.h>
@@ -30,6 +29,7 @@
#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
+#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192
#define DM_VERITY_MAX_CORRUPTED_ERRS 100
@@ -49,10 +49,16 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE
module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644);
-static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled);
+static unsigned int dm_verity_use_bh_bytes[4] = {
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE
+ 0 // IOPRIO_CLASS_IDLE
+};
-/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */
-static DEFINE_STATIC_KEY_FALSE(ahash_enabled);
+module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644);
+
+static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled);
struct dm_verity_prefetch_work {
struct work_struct work;
@@ -108,100 +114,21 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
return block >> (level * v->hash_per_block_bits);
}
-static int verity_ahash_update(struct dm_verity *v, struct ahash_request *req,
- const u8 *data, size_t len,
- struct crypto_wait *wait)
-{
- struct scatterlist sg;
-
- if (likely(!is_vmalloc_addr(data))) {
- sg_init_one(&sg, data, len);
- ahash_request_set_crypt(req, &sg, NULL, len);
- return crypto_wait_req(crypto_ahash_update(req), wait);
- }
-
- do {
- int r;
- size_t this_step = min_t(size_t, len, PAGE_SIZE - offset_in_page(data));
-
- flush_kernel_vmap_range((void *)data, this_step);
- sg_init_table(&sg, 1);
- sg_set_page(&sg, vmalloc_to_page(data), this_step, offset_in_page(data));
- ahash_request_set_crypt(req, &sg, NULL, this_step);
- r = crypto_wait_req(crypto_ahash_update(req), wait);
- if (unlikely(r))
- return r;
- data += this_step;
- len -= this_step;
- } while (len);
-
- return 0;
-}
-
-/*
- * Wrapper for crypto_ahash_init, which handles verity salting.
- */
-static int verity_ahash_init(struct dm_verity *v, struct ahash_request *req,
- struct crypto_wait *wait, bool may_sleep)
-{
- int r;
-
- ahash_request_set_tfm(req, v->ahash_tfm);
- ahash_request_set_callback(req,
- may_sleep ? CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0,
- crypto_req_done, (void *)wait);
- crypto_init_wait(wait);
-
- r = crypto_wait_req(crypto_ahash_init(req), wait);
-
- if (unlikely(r < 0)) {
- if (r != -ENOMEM)
- DMERR("crypto_ahash_init failed: %d", r);
- return r;
- }
-
- if (likely(v->salt_size && (v->version >= 1)))
- r = verity_ahash_update(v, req, v->salt, v->salt_size, wait);
-
- return r;
-}
-
-static int verity_ahash_final(struct dm_verity *v, struct ahash_request *req,
- u8 *digest, struct crypto_wait *wait)
-{
- int r;
-
- if (unlikely(v->salt_size && (!v->version))) {
- r = verity_ahash_update(v, req, v->salt, v->salt_size, wait);
-
- if (r < 0) {
- DMERR("%s failed updating salt: %d", __func__, r);
- goto out;
- }
- }
-
- ahash_request_set_crypt(req, NULL, digest, 0);
- r = crypto_wait_req(crypto_ahash_final(req), wait);
-out:
- return r;
-}
-
int verity_hash(struct dm_verity *v, struct dm_verity_io *io,
- const u8 *data, size_t len, u8 *digest, bool may_sleep)
+ const u8 *data, size_t len, u8 *digest)
{
+ struct shash_desc *desc = &io->hash_desc;
int r;
- if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) {
- struct ahash_request *req = verity_io_hash_req(v, io);
- struct crypto_wait wait;
-
- r = verity_ahash_init(v, req, &wait, may_sleep) ?:
- verity_ahash_update(v, req, data, len, &wait) ?:
- verity_ahash_final(v, req, digest, &wait);
+ desc->tfm = v->shash_tfm;
+ if (unlikely(v->initial_hashstate == NULL)) {
+ /* Version 0: salt at end */
+ r = crypto_shash_init(desc) ?:
+ crypto_shash_update(desc, data, len) ?:
+ crypto_shash_update(desc, v->salt, v->salt_size) ?:
+ crypto_shash_final(desc, digest);
} else {
- struct shash_desc *desc = verity_io_hash_req(v, io);
-
- desc->tfm = v->shash_tfm;
+ /* Version 1: salt at beginning */
r = crypto_shash_import(desc, v->initial_hashstate) ?:
crypto_shash_finup(desc, data, len, digest);
}
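
The version >= 1 branch above works because the salt is hashed exactly once when the target is constructed: the partial shash state is exported into v->initial_hashstate, and every per-block hash then just imports that state and finishes over the block data. A rough kernel-style sketch of that split, with the helper names invented here and error handling trimmed:

#include <crypto/hash.h>
#include <linux/slab.h>
#include <linux/types.h>

/* Hash only the salt once, then snapshot the partial state. */
static int precompute_salted_state(struct crypto_shash *tfm,
				   const u8 *salt, size_t salt_size,
				   u8 **state_out)
{
	SHASH_DESC_ON_STACK(desc, tfm);
	u8 *state;
	int err;

	state = kmalloc(crypto_shash_statesize(tfm), GFP_KERNEL);
	if (!state)
		return -ENOMEM;

	desc->tfm = tfm;
	err = crypto_shash_init(desc) ?:
	      crypto_shash_update(desc, salt, salt_size) ?:
	      crypto_shash_export(desc, state);
	if (err) {
		kfree(state);
		return err;
	}
	*state_out = state;
	return 0;
}

/* Per block: resume from the snapshot and hash only the data. */
static int hash_one_block(struct crypto_shash *tfm, const u8 *state,
			  const u8 *data, size_t len, u8 *digest)
{
	SHASH_DESC_ON_STACK(desc, tfm);

	desc->tfm = tfm;
	return crypto_shash_import(desc, state) ?:
	       crypto_shash_finup(desc, data, len, digest);
}

The version 0 format appends the salt after the data, so no reusable prefix state exists and the driver has to hash init + data + salt on every block, as the first branch of verity_hash() shows.
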
@@ -311,7 +238,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
data = dm_bufio_get(v->bufio, hash_block, &buf);
- if (data == NULL) {
+ if (IS_ERR_OR_NULL(data)) {
/*
* In tasklet and the hash was not in the bufio cache.
* Return early and resume execution from a work-queue
@@ -321,11 +248,27 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
}
} else {
data = dm_bufio_read_with_ioprio(v->bufio, hash_block,
- &buf, bio_prio(bio));
+ &buf, bio->bi_ioprio);
}
- if (IS_ERR(data))
- return PTR_ERR(data);
+ if (IS_ERR(data)) {
+ if (skip_unverified)
+ return 1;
+ r = PTR_ERR(data);
+ data = dm_bufio_new(v->bufio, hash_block, &buf);
+ if (IS_ERR(data))
+ return r;
+ if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block, data) == 0) {
+ aux = dm_bufio_get_aux_data(buf);
+ aux->hash_verified = 1;
+ goto release_ok;
+ } else {
+ dm_bufio_release(buf);
+ dm_bufio_forget(v->bufio, hash_block);
+ return r;
+ }
+ }
aux = dm_bufio_get_aux_data(buf);
@@ -336,7 +279,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
}
r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits,
- verity_io_real_digest(v, io), !io->in_bh);
+ verity_io_real_digest(v, io));
if (unlikely(r < 0))
goto release_ret_r;
@@ -366,6 +309,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
}
}
+release_ok:
data += offset;
memcpy(want_digest, data, v->digest_size);
r = 0;
@@ -438,7 +382,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
goto free_ret;
r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits,
- verity_io_real_digest(v, io), true);
+ verity_io_real_digest(v, io));
if (unlikely(r))
goto free_ret;
@@ -554,7 +498,7 @@ static int verity_verify_io(struct dm_verity_io *io)
}
r = verity_hash(v, io, data, block_size,
- verity_io_real_digest(v, io), !io->in_bh);
+ verity_io_real_digest(v, io));
if (unlikely(r < 0)) {
kunmap_local(data);
return r;
@@ -652,9 +596,18 @@ static void verity_bh_work(struct work_struct *w)
verity_finish_io(io, errno_to_blk_status(err));
}
+static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio)
+{
+ return ioprio <= IOPRIO_CLASS_IDLE &&
+ bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]) &&
+ !need_resched();
+}
+
static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
+ unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+ unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits;
if (bio->bi_status &&
(!verity_fec_is_enabled(io->v) ||
@@ -664,9 +617,14 @@ static void verity_end_io(struct bio *bio)
return;
}
- if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) {
- INIT_WORK(&io->bh_work, verity_bh_work);
- queue_work(system_bh_wq, &io->bh_work);
+ if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq &&
+ verity_use_bh(bytes, ioprio)) {
+ if (in_hardirq() || irqs_disabled()) {
+ INIT_WORK(&io->bh_work, verity_bh_work);
+ queue_work(system_bh_wq, &io->bh_work);
+ } else {
+ verity_bh_work(&io->bh_work);
+ }
} else {
INIT_WORK(&io->work, verity_work);
queue_work(io->v->verify_wq, &io->work);
@@ -789,13 +747,20 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
verity_fec_init_io(io);
- verity_submit_prefetch(v, io, bio_prio(bio));
+ verity_submit_prefetch(v, io, bio->bi_ioprio);
submit_bio_noacct(bio);
return DM_MAPIO_SUBMITTED;
}
+static void verity_postsuspend(struct dm_target *ti)
+{
+ struct dm_verity *v = ti->private;
+ flush_workqueue(v->verify_wq);
+ dm_bufio_client_reset(v->bufio);
+}
+
/*
* Status: V (valid) or C (corruption found)
*/
@@ -946,7 +911,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
}
}
-static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct dm_verity *v = ti->private;
@@ -1042,12 +1009,7 @@ static void verity_dtr(struct dm_target *ti)
kfree(v->zero_digest);
verity_free_sig(v);
- if (v->ahash_tfm) {
- static_branch_dec(&ahash_enabled);
- crypto_free_ahash(v->ahash_tfm);
- } else {
- crypto_free_shash(v->shash_tfm);
- }
+ crypto_free_shash(v->shash_tfm);
kfree(v->alg_name);
@@ -1073,6 +1035,9 @@ static int verity_alloc_most_once(struct dm_verity *v)
{
struct dm_target *ti = v->ti;
+ if (v->validated_blocks)
+ return 0;
+
/* the bitset can only handle INT_MAX blocks */
if (v->data_blocks > INT_MAX) {
ti->error = "device too large to use check_at_most_once";
@@ -1096,12 +1061,16 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
struct dm_verity_io *io;
u8 *zero_data;
+ if (v->zero_digest)
+ return 0;
+
v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
if (!v->zero_digest)
return r;
- io = kmalloc(sizeof(*io) + v->hash_reqsize, GFP_KERNEL);
+ io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm),
+ GFP_KERNEL);
if (!io)
return r; /* verity_dtr will free zero_digest */
@@ -1112,7 +1081,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
goto out;
r = verity_hash(v, io, zero_data, 1 << v->data_dev_block_bits,
- v->zero_digest, true);
+ v->zero_digest);
out:
kfree(io);
@@ -1268,9 +1237,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name)
{
struct dm_target *ti = v->ti;
- struct crypto_ahash *ahash;
- struct crypto_shash *shash = NULL;
- const char *driver_name;
+ struct crypto_shash *shash;
v->alg_name = kstrdup(alg_name, GFP_KERNEL);
if (!v->alg_name) {
@@ -1278,50 +1245,14 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name)
return -ENOMEM;
}
- /*
- * Allocate the hash transformation object that this dm-verity instance
- * will use. The vast majority of dm-verity users use CPU-based
- * hashing, so when possible use the shash API to minimize the crypto
- * API overhead. If the ahash API resolves to a different driver
- * (likely an off-CPU hardware offload), use ahash instead. Also use
- * ahash if the obsolete dm-verity format with the appended salt is
- * being used, so that quirk only needs to be handled in one place.
- */
- ahash = crypto_alloc_ahash(alg_name, 0,
- v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0);
- if (IS_ERR(ahash)) {
+ shash = crypto_alloc_shash(alg_name, 0, 0);
+ if (IS_ERR(shash)) {
ti->error = "Cannot initialize hash function";
- return PTR_ERR(ahash);
- }
- driver_name = crypto_ahash_driver_name(ahash);
- if (v->version >= 1 /* salt prepended, not appended? */) {
- shash = crypto_alloc_shash(alg_name, 0, 0);
- if (!IS_ERR(shash) &&
- strcmp(crypto_shash_driver_name(shash), driver_name) != 0) {
- /*
- * ahash gave a different driver than shash, so probably
- * this is a case of real hardware offload. Use ahash.
- */
- crypto_free_shash(shash);
- shash = NULL;
- }
- }
- if (!IS_ERR_OR_NULL(shash)) {
- crypto_free_ahash(ahash);
- ahash = NULL;
- v->shash_tfm = shash;
- v->digest_size = crypto_shash_digestsize(shash);
- v->hash_reqsize = sizeof(struct shash_desc) +
- crypto_shash_descsize(shash);
- DMINFO("%s using shash \"%s\"", alg_name, driver_name);
- } else {
- v->ahash_tfm = ahash;
- static_branch_inc(&ahash_enabled);
- v->digest_size = crypto_ahash_digestsize(ahash);
- v->hash_reqsize = sizeof(struct ahash_request) +
- crypto_ahash_reqsize(ahash);
- DMINFO("%s using ahash \"%s\"", alg_name, driver_name);
+ return PTR_ERR(shash);
}
+ v->shash_tfm = shash;
+ v->digest_size = crypto_shash_digestsize(shash);
+ DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash));
if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
ti->error = "Digest size too big";
return -EINVAL;
@@ -1346,7 +1277,7 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg)
return -EINVAL;
}
}
- if (v->shash_tfm) {
+ if (v->version) { /* Version 1: salt at beginning */
SHASH_DESC_ON_STACK(desc, v->shash_tfm);
int r;
@@ -1530,7 +1461,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- /* Root hash signature is a optional parameter*/
+ /* Root hash signature is an optional parameter */
r = verity_verify_root_hash(root_hash_digest_to_validate,
strlen(root_hash_digest_to_validate),
verify_args.sig,
@@ -1625,7 +1556,8 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- ti->per_io_data_size = sizeof(struct dm_verity_io) + v->hash_reqsize;
+ ti->per_io_data_size = sizeof(struct dm_verity_io) +
+ crypto_shash_descsize(v->shash_tfm);
r = verity_fec_ctr(v);
if (r)
@@ -1732,10 +1664,7 @@ static int verity_preresume(struct dm_target *ti)
bdev = dm_disk(dm_table_get_md(ti->table))->part0;
root_digest.digest = v->root_digest;
root_digest.digest_len = v->digest_size;
- if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm)
- root_digest.alg = crypto_ahash_alg_name(v->ahash_tfm);
- else
- root_digest.alg = crypto_shash_alg_name(v->shash_tfm);
+ root_digest.alg = crypto_shash_alg_name(v->shash_tfm);
r = security_bdev_setintegrity(bdev, LSM_INT_DMVERITY_ROOTHASH, &root_digest,
sizeof(root_digest));
@@ -1761,11 +1690,12 @@ static struct target_type verity_target = {
.name = "verity",
/* Note: the LSMs depend on the singleton and immutable features */
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
- .version = {1, 10, 0},
+ .version = {1, 12, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
.map = verity_map,
+ .postsuspend = verity_postsuspend,
.status = verity_status,
.prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
index a9e2c6c0a33c..d5261a0e4232 100644
--- a/drivers/md/dm-verity-verify-sig.c
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -71,9 +71,14 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
const char *arg_name)
{
struct dm_target *ti = v->ti;
- int ret = 0;
+ int ret;
const char *sig_key = NULL;
+ if (v->signature_key_desc) {
+ ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified");
+ return -EINVAL;
+ }
+
if (!*argc) {
ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified");
return -EINVAL;
@@ -83,14 +88,18 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
(*argc)--;
ret = verity_verify_get_sig_from_key(sig_key, sig_opts);
- if (ret < 0)
+ if (ret < 0) {
ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified");
+ return ret;
+ }
v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL);
- if (!v->signature_key_desc)
+ if (!v->signature_key_desc) {
+ ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key");
return -ENOMEM;
+ }
- return ret;
+ return 0;
}
/*
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 8cbb57862ae1..6d141abd965c 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -39,11 +39,10 @@ struct dm_verity {
struct dm_target *ti;
struct dm_bufio_client *bufio;
char *alg_name;
- struct crypto_ahash *ahash_tfm; /* either this or shash_tfm is set */
- struct crypto_shash *shash_tfm; /* either this or ahash_tfm is set */
+ struct crypto_shash *shash_tfm;
u8 *root_digest; /* digest of the root block */
u8 *salt; /* salt: its size is salt_size */
- u8 *initial_hashstate; /* salted initial state, if shash_tfm is set */
+ u8 *initial_hashstate; /* salted initial state, if version >= 1 */
u8 *zero_digest; /* digest for a zero block */
#ifdef CONFIG_SECURITY
u8 *root_digest_sig; /* signature of the root digest */
@@ -61,7 +60,6 @@ struct dm_verity {
bool hash_failed:1; /* set if hash of any block failed */
bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */
unsigned int digest_size; /* digest size for the current hash algorithm */
- unsigned int hash_reqsize; /* the size of temporary space for crypto */
enum verity_mode mode; /* mode for handling verification errors */
enum verity_mode error_mode;/* mode for handling I/O errors */
unsigned int corrupted_errs;/* Number of errors for corrupted blocks */
@@ -100,19 +98,13 @@ struct dm_verity_io {
u8 want_digest[HASH_MAX_DIGESTSIZE];
/*
- * This struct is followed by a variable-sized hash request of size
- * v->hash_reqsize, either a struct ahash_request or a struct shash_desc
- * (depending on whether ahash_tfm or shash_tfm is being used). To
- * access it, use verity_io_hash_req().
+ * Temporary space for hashing. This is variable-length and must be at
+ * the end of the struct. struct shash_desc is just the fixed part;
+ * it's followed by a context of size crypto_shash_descsize(shash_tfm).
*/
+ struct shash_desc hash_desc;
};
-static inline void *verity_io_hash_req(struct dm_verity *v,
- struct dm_verity_io *io)
-{
- return io + 1;
-}
-
static inline u8 *verity_io_real_digest(struct dm_verity *v,
struct dm_verity_io *io)
{
@@ -126,7 +118,7 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v,
}
extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io,
- const u8 *data, size_t len, u8 *digest, bool may_sleep);
+ const u8 *data, size_t len, u8 *digest);
extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
sector_t block, u8 *digest, bool *is_zero);
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 7ce8847b3404..d8de4a3076a1 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -13,7 +13,6 @@
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
-#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
#include <linux/delay.h>
#include "dm-io-tracker.h"
@@ -256,7 +255,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
int r;
loff_t s;
long p, da;
- pfn_t pfn;
+ unsigned long pfn;
int id;
struct page **pages;
sector_t offset;
@@ -290,7 +289,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
r = da;
goto err2;
}
- if (!pfn_t_has_page(pfn)) {
+ if (!pfn_valid(pfn)) {
wc->memory_map = NULL;
r = -EOPNOTSUPP;
goto err2;
@@ -314,13 +313,13 @@ static int persistent_memory_claim(struct dm_writecache *wc)
r = daa ? daa : -EINVAL;
goto err3;
}
- if (!pfn_t_has_page(pfn)) {
+ if (!pfn_valid(pfn)) {
r = -EOPNOTSUPP;
goto err3;
}
while (daa-- && i < p) {
- pages[i++] = pfn_t_to_page(pfn);
- pfn.val++;
+ pages[i++] = pfn_to_page(pfn);
+ pfn++;
if (!(i & 15))
cond_resched();
}
@@ -706,7 +705,7 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc)
static void writecache_max_age_timer(struct timer_list *t)
{
- struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
+ struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer);
if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
queue_work(wc->writeback_wq, &wc->writeback_work);
@@ -797,7 +796,7 @@ static void writecache_flush(struct dm_writecache *wc)
bool need_flush_after_free;
wc->uncommitted_blocks = 0;
- del_timer(&wc->autocommit_timer);
+ timer_delete(&wc->autocommit_timer);
if (list_empty(&wc->lru))
return;
@@ -866,7 +865,7 @@ static void writecache_flush_work(struct work_struct *work)
static void writecache_autocommit_timer(struct timer_list *t)
{
- struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
+ struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer);
if (!writecache_has_error(wc))
queue_work(wc->writeback_wq, &wc->flush_work);
@@ -927,8 +926,8 @@ static void writecache_suspend(struct dm_target *ti)
struct dm_writecache *wc = ti->private;
bool flush_on_suspend;
- del_timer_sync(&wc->autocommit_timer);
- del_timer_sync(&wc->max_age_timer);
+ timer_delete_sync(&wc->autocommit_timer);
+ timer_delete_sync(&wc->max_age_timer);
wc_lock(wc);
writecache_flush(wc);
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 20edd3fabbab..5a840c4ae316 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -17,33 +17,26 @@
* For internal zone reports bypassing the top BIO submission path.
*/
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
- sector_t sector, unsigned int nr_zones,
- report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct dm_report_zones_args *args)
{
- struct gendisk *disk = md->disk;
- int ret;
- struct dm_report_zones_args args = {
- .next_sector = sector,
- .orig_data = data,
- .orig_cb = cb,
- };
-
do {
struct dm_target *tgt;
+ int ret;
- tgt = dm_table_find_target(t, args.next_sector);
+ tgt = dm_table_find_target(t, args->next_sector);
if (WARN_ON_ONCE(!tgt->type->report_zones))
return -EIO;
- args.tgt = tgt;
- ret = tgt->type->report_zones(tgt, &args,
- nr_zones - args.zone_idx);
+ args->tgt = tgt;
+ ret = tgt->type->report_zones(tgt, args,
+ nr_zones - args->zone_idx);
if (ret < 0)
return ret;
- } while (args.zone_idx < nr_zones &&
- args.next_sector < get_capacity(disk));
+ } while (args->zone_idx < nr_zones &&
+ args->next_sector < get_capacity(md->disk));
- return args.zone_idx;
+ return args->zone_idx;
}
/*
@@ -52,28 +45,41 @@ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
* generally implemented by targets using dm_report_zones().
*/
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct mapped_device *md = disk->private_data;
struct dm_table *map;
- int srcu_idx, ret;
+ struct dm_table *zone_revalidate_map = md->zone_revalidate_map;
+ int srcu_idx, ret = -EIO;
+ bool put_table = false;
- if (!md->zone_revalidate_map) {
- /* Regular user context */
+ if (!zone_revalidate_map || md->revalidate_map_task != current) {
+ /*
+ * Regular user context or
+ * Zone revalidation during __bind() is in progress, but this
+ * call is from a different process
+ */
if (dm_suspended_md(md))
return -EAGAIN;
map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ put_table = true;
} else {
/* Zone revalidation during __bind() */
- map = md->zone_revalidate_map;
+ map = zone_revalidate_map;
}
- ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
+ if (map) {
+ struct dm_report_zones_args dm_args = {
+ .disk = md->disk,
+ .next_sector = sector,
+ .rep_args = args,
+ };
+ ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args);
+ }
- if (!md->zone_revalidate_map)
+ if (put_table)
dm_put_live_table(md, srcu_idx);
return ret;
@@ -106,7 +112,18 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
}
args->next_sector = zone->start + zone->len;
- return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+
+ /* If we have an internal callback, call it first. */
+ if (args->cb) {
+ int ret;
+
+ ret = args->cb(zone, args->zone_idx, args->data);
+ if (ret)
+ return ret;
+ }
+
+ return disk_report_zone(args->disk, zone, args->zone_idx++,
+ args->rep_args);
}
/*
@@ -153,33 +170,36 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q)
{
struct mapped_device *md = t->md;
struct gendisk *disk = md->disk;
+ unsigned int nr_zones = disk->nr_zones;
int ret;
if (!get_capacity(disk))
return 0;
- /* Revalidate only if something changed. */
- if (!disk->nr_zones || disk->nr_zones != md->nr_zones) {
- DMINFO("%s using %s zone append",
- disk->disk_name,
- queue_emulates_zone_append(q) ? "emulated" : "native");
- md->nr_zones = 0;
- }
-
- if (md->nr_zones)
+ /*
+ * Do not revalidate if zone write plug resources have already
+ * been allocated.
+ */
+ if (dm_has_zone_plugs(md))
return 0;
+ DMINFO("%s using %s zone append", disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
+
/*
* Our table is not live yet. So the call to dm_get_live_table()
* in dm_blk_report_zones() will fail. Set a temporary pointer to
* our table for dm_blk_report_zones() to use directly.
*/
md->zone_revalidate_map = t;
+ md->revalidate_map_task = current;
ret = blk_revalidate_disk_zones(disk);
+ md->revalidate_map_task = NULL;
md->zone_revalidate_map = NULL;
if (ret) {
DMERR("Revalidate zones failed %d", ret);
+ disk->nr_zones = nr_zones;
return ret;
}
@@ -337,15 +357,15 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
/*
* Check if zone append is natively supported, and if not, set the
- * mapped device queue as needing zone append emulation.
+ * mapped device queue as needing zone append emulation. If zone
+ * append is natively supported, make sure that
+ * max_hw_zone_append_sectors is not set to 0.
*/
WARN_ON_ONCE(queue_is_mq(q));
- if (dm_table_supports_zone_append(t)) {
- clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- } else {
- set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ if (!dm_table_supports_zone_append(t))
lim->max_hw_zone_append_sectors = 0;
- }
+ else if (lim->max_hw_zone_append_sectors == 0)
+ lim->max_hw_zone_append_sectors = lim->max_zone_append_sectors;
/*
* Determine the max open and max active zone limits for the mapped
@@ -380,15 +400,28 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
lim->max_open_zones = 0;
lim->max_active_zones = 0;
lim->max_hw_zone_append_sectors = 0;
+ lim->max_zone_append_sectors = 0;
lim->zone_write_granularity = 0;
lim->chunk_sectors = 0;
lim->features &= ~BLK_FEAT_ZONED;
- clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- md->nr_zones = 0;
- disk->nr_zones = 0;
return 0;
}
+ if (get_capacity(disk) && dm_has_zone_plugs(t->md)) {
+ if (q->limits.chunk_sectors != lim->chunk_sectors) {
+ DMWARN("%s: device has zone write plug resources. "
+ "Cannot change zone size",
+ disk->disk_name);
+ return -EINVAL;
+ }
+ if (lim->max_hw_zone_append_sectors != 0 &&
+ !dm_table_is_wildcard(t)) {
+ DMWARN("%s: device has zone write plug resources. "
+ "New table must emulate zone append",
+ disk->disk_name);
+ return -EINVAL;
+ }
+ }
/*
* Warn once (when the capacity is not yet set) if the mapped device is
* partially using zone resources of the target devices as that leads to
@@ -408,6 +441,23 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
return 0;
}
+void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim)
+{
+ struct mapped_device *md = t->md;
+
+ if (lim->features & BLK_FEAT_ZONED) {
+ if (dm_table_supports_zone_append(t))
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ else
+ set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ } else {
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ md->nr_zones = 0;
+ md->disk->nr_zones = 0;
+ }
+}
+
/*
* IO completion callback called from clone_endio().
*/
@@ -423,12 +473,10 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
*/
if (clone->bi_status == BLK_STS_OK &&
bio_op(clone) == REQ_OP_ZONE_APPEND) {
- sector_t mask = bdev_zone_sectors(disk->part0) - 1;
-
- orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask;
+ orig_bio->bi_iter.bi_sector +=
+ bdev_offset_from_zone_start(disk->part0,
+ clone->bi_iter.bi_sector);
}
-
- return;
}
static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
@@ -454,10 +502,15 @@ int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
sector_t sector, unsigned int nr_zones,
unsigned long *need_reset)
{
+ struct dm_report_zones_args args = {
+ .disk = md->disk,
+ .next_sector = sector,
+ .cb = dm_zone_need_reset_cb,
+ .data = need_reset,
+ };
int ret;
- ret = dm_blk_do_report_zones(md, t, sector, nr_zones,
- dm_zone_need_reset_cb, need_reset);
+ ret = dm_blk_do_report_zones(md, t, nr_zones, &args);
if (ret != nr_zones) {
DMERR("Get %s zone reset bitmap failed\n",
md->disk->disk_name);
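For context on the dm_zone_endio() change above: the open-coded mask arithmetic that was removed only works when the zone size is a power of two, which is also what the bdev_offset_from_zone_start() helper it was replaced with relies on. A minimal user-space sketch of that computation (zone size and sector values below are made up for illustration, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Offset of a sector from the start of its zone, for power-of-two zone sizes.
 * Mirrors the mask arithmetic the old dm_zone_endio() open-coded. */
static uint64_t offset_from_zone_start(uint64_t sector, uint64_t zone_sectors)
{
	return sector & (zone_sectors - 1);
}

int main(void)
{
	uint64_t zone_sectors = 524288;     /* 256 MiB zone, 512 B sectors */
	uint64_t written = 524288 * 3 + 96; /* sector returned for a zone append */

	/* The original bio's sector is advanced by this offset on completion. */
	printf("offset in zone: %llu\n",
	       (unsigned long long)offset_from_zone_start(written, zone_sectors));
	return 0;
}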
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index d58db9a27e6c..76e2c6868548 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -76,9 +76,9 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
* pointer and the requested position.
*/
nr_blocks = block - wp_block;
- ret = blkdev_issue_zeroout(dev->bdev,
- dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
- dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
+ ret = blk_zone_issue_zeroout(dev->bdev,
+ dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
+ dmz_blk2sect(nr_blocks), GFP_NOIO);
if (ret) {
dmz_dev_err(dev,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 6141fc25d842..9da329078ea4 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1015,7 +1015,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
/*
* Pass on ioctl to the backend device.
*/
-static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg, bool *forward)
{
struct dmz_target *dmz = ti->private;
struct dmz_dev *dev = &dmz->dev[0];
@@ -1061,7 +1062,7 @@ static int dmz_iterate_devices(struct dm_target *ti,
struct dmz_target *dmz = ti->private;
unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
sector_t capacity;
- int i, r;
+ int i, r = 0;
for (i = 0; i < dmz->nr_ddevs; i++) {
capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
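The new prepare_ioctl prototype shown in dmz_prepare_ioctl() hands targets the ioctl command and a *forward flag. A hypothetical target method sketching how the flag is meant to be used, based only on the calling convention visible in the dm.c hunks below; the command constant and handler name are invented for illustration:

#include <stdbool.h>

struct dm_target;		/* opaque for this sketch */
struct block_device;		/* opaque for this sketch */

#define MY_TARGET_CMD 0x4d00	/* illustrative, not a real ioctl number */

static int my_target_handle_cmd(struct dm_target *ti, unsigned long arg)
{
	(void)ti;
	(void)arg;
	return 0;	/* pretend the command was handled successfully */
}

static int my_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
			    unsigned int cmd, unsigned long arg, bool *forward)
{
	(void)bdev;
	if (cmd == MY_TARGET_CMD) {
		*forward = false;	/* consumed here; core must not forward it */
		return my_target_handle_cmd(ti, arg);
	}
	/* Anything else: leave *forward true so DM core sends it to *bdev. */
	return 0;
}

int main(void)
{
	bool forward = true;
	struct dm_target *ti = 0;
	struct block_device *bdev = 0;

	my_prepare_ioctl(ti, &bdev, MY_TARGET_CMD, 0, &forward);
	return forward ? 1 : 0;	/* returns 0: the command was consumed */
}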
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 12ecf07a3841..6c83ab940af7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -403,15 +403,16 @@ static void do_deferred_remove(struct work_struct *w)
dm_deferred_remove();
}
-static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
+ struct mapped_device *md = disk->private_data;
return dm_get_geometry(md, geo);
}
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
- struct block_device **bdev)
+ struct block_device **bdev, unsigned int cmd,
+ unsigned long arg, bool *forward)
{
struct dm_target *ti;
struct dm_table *map;
@@ -434,8 +435,8 @@ retry:
if (dm_suspended_md(md))
return -EAGAIN;
- r = ti->type->prepare_ioctl(ti, bdev);
- if (r == -ENOTCONN && !fatal_signal_pending(current)) {
+ r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward);
+ if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) {
dm_put_live_table(md, *srcu_idx);
fsleep(10000);
goto retry;
@@ -454,9 +455,10 @@ static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode,
{
struct mapped_device *md = bdev->bd_disk->private_data;
int r, srcu_idx;
+ bool forward = true;
- r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
- if (r < 0)
+ r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward);
+ if (!forward || r < 0)
goto out;
if (r > 0) {
@@ -488,18 +490,13 @@ u64 dm_start_time_ns_from_clone(struct bio *bio)
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
-static inline bool bio_is_flush_with_data(struct bio *bio)
-{
- return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
-}
-
static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio)
{
/*
* If REQ_PREFLUSH set, don't account payload, it will be
* submitted (and accounted) after this flush completes.
*/
- if (bio_is_flush_with_data(bio))
+ if (io->requeue_flush_with_data)
return 0;
if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
return io->sectors;
@@ -588,6 +585,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t g
io = container_of(tio, struct dm_io, tio);
io->magic = DM_IO_MAGIC;
io->status = BLK_STS_OK;
+ io->requeue_flush_with_data = false;
/* one ref is for submission, the other is for completion */
atomic_set(&io->io_count, 2);
@@ -946,6 +944,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage)
struct mapped_device *md = io->md;
blk_status_t io_error;
bool requeued;
+ bool requeue_flush_with_data;
requeued = dm_handle_requeue(io, first_stage);
if (requeued && first_stage)
@@ -962,6 +961,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage)
__dm_start_io_acct(io);
dm_end_io_acct(io);
}
+ requeue_flush_with_data = io->requeue_flush_with_data;
free_io(io);
smp_wmb();
this_cpu_dec(*md->pending_io);
@@ -974,7 +974,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage)
if (requeued)
return;
- if (bio_is_flush_with_data(bio)) {
+ if (unlikely(requeue_flush_with_data)) {
/*
* Preflush done for flush with data, reissue
* without REQ_PREFLUSH.
@@ -1022,10 +1022,8 @@ static void dm_wq_requeue_work(struct work_struct *work)
*
* 2) io->orig_bio points to new cloned bio which matches the requeued dm_io.
*/
-static void dm_io_complete(struct dm_io *io)
+static inline void dm_io_complete(struct dm_io *io)
{
- bool first_requeue;
-
/*
* Only dm_io that has been split needs two stage requeue, otherwise
* we may run into long bio clone chain during suspend and OOM could
@@ -1034,12 +1032,7 @@ static void dm_io_complete(struct dm_io *io)
* Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they
* also aren't handled via the first stage requeue.
*/
- if (dm_io_flagged(io, DM_IO_WAS_SPLIT))
- first_requeue = true;
- else
- first_requeue = false;
-
- __dm_io_complete(io, first_requeue);
+ __dm_io_complete(io, dm_io_flagged(io, DM_IO_WAS_SPLIT));
}
/*
@@ -1082,22 +1075,6 @@ static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
return &md->queue->limits;
}
-void disable_discard(struct mapped_device *md)
-{
- struct queue_limits *limits = dm_get_queue_limits(md);
-
- /* device doesn't really support DISCARD, disable it */
- limits->max_hw_discard_sectors = 0;
-}
-
-void disable_write_zeroes(struct mapped_device *md)
-{
- struct queue_limits *limits = dm_get_queue_limits(md);
-
- /* device doesn't really support WRITE ZEROES, disable it */
- limits->max_write_zeroes_sectors = 0;
-}
-
static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
{
return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
@@ -1115,10 +1092,10 @@ static void clone_endio(struct bio *bio)
if (unlikely(error == BLK_STS_TARGET)) {
if (bio_op(bio) == REQ_OP_DISCARD &&
!bdev_max_discard_sectors(bio->bi_bdev))
- disable_discard(md);
+ blk_queue_disable_discard(md->queue);
else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
!bdev_write_zeroes_sectors(bio->bi_bdev))
- disable_write_zeroes(md);
+ blk_queue_disable_write_zeroes(md->queue);
}
if (static_branch_unlikely(&zoned_enabled) &&
@@ -1232,7 +1209,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode, void **kaddr,
- pfn_t *pfn)
+ unsigned long *pfn)
{
struct mapped_device *md = dax_get_private(dax_dev);
sector_t sector = pgoff * PAGE_SECTORS;
@@ -1307,8 +1284,9 @@ out:
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
- * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
- * __send_duplicate_bios().
+ * operations, zone append writes (native with REQ_OP_ZONE_APPEND or emulated
+ * with write BIOs flagged with BIO_EMULATES_ZONE_APPEND) and any bio serviced
+ * by __send_duplicate_bios().
*
* dm_accept_partial_bio informs the dm that the target only wants to process
* additional n_sectors sectors of the bio and the rest of the data should be
@@ -1341,11 +1319,19 @@ void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors)
unsigned int bio_sectors = bio_sectors(bio);
BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
- BUG_ON(op_is_zone_mgmt(bio_op(bio)));
- BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
BUG_ON(bio_sectors > *tio->len_ptr);
BUG_ON(n_sectors > bio_sectors);
+ if (static_branch_unlikely(&zoned_enabled) &&
+ unlikely(bdev_is_zoned(bio->bi_bdev))) {
+ enum req_op op = bio_op(bio);
+
+ BUG_ON(op_is_zone_mgmt(op));
+ BUG_ON(op == REQ_OP_WRITE);
+ BUG_ON(op == REQ_OP_WRITE_ZEROES);
+ BUG_ON(op == REQ_OP_ZONE_APPEND);
+ }
+
*tio->len_ptr -= bio_sectors - n_sectors;
bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
@@ -1479,12 +1465,12 @@ static void setup_split_accounting(struct clone_info *ci, unsigned int len)
static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
struct dm_target *ti, unsigned int num_bios,
- unsigned *len, gfp_t gfp_flag)
+ unsigned *len)
{
struct bio *bio;
- int try = (gfp_flag & GFP_NOWAIT) ? 0 : 1;
+ int try;
- for (; try < 2; try++) {
+ for (try = 0; try < 2; try++) {
int bio_nr;
if (try && num_bios > 1)
@@ -1508,8 +1494,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
}
static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
- unsigned int num_bios, unsigned int *len,
- gfp_t gfp_flag)
+ unsigned int num_bios, unsigned int *len)
{
struct bio_list blist = BIO_EMPTY_LIST;
struct bio *clone;
@@ -1526,7 +1511,7 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
* Using alloc_multiple_bios(), even if num_bios is 1, to consistently
* support allocating using GFP_NOWAIT with GFP_NOIO fallback.
*/
- alloc_multiple_bios(&blist, ci, ti, num_bios, len, gfp_flag);
+ alloc_multiple_bios(&blist, ci, ti, num_bios, len);
while ((clone = bio_list_pop(&blist))) {
if (num_bios > 1)
dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
@@ -1541,14 +1526,18 @@ static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
+ blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+
+ if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) ==
+ (REQ_IDLE | REQ_SYNC))
+ opf |= REQ_IDLE;
/*
* Use an on-stack bio for this, it's safe since we don't
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
- bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
- REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
+ bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf);
ci->bio = &flush_bio;
ci->sector_count = 0;
@@ -1564,7 +1553,7 @@ static void __send_empty_flush(struct clone_info *ci)
atomic_add(ti->num_flush_bios, &ci->io->io_count);
bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
- NULL, GFP_NOWAIT);
+ NULL);
atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
}
} else {
@@ -1612,7 +1601,7 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti,
__max_io_len(ti, ci->sector, max_granularity, max_sectors));
atomic_add(num_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, num_bios, &len, GFP_NOIO);
+ bios = __send_duplicate_bios(ci, ti, num_bios, &len);
/*
* alloc_io() takes one extra reference for submission, so the
* reference won't reach 0 without the following (+1) subtraction
@@ -1746,6 +1735,9 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
+ if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count)
+ return BLK_STS_IOERR;
+
setup_split_accounting(ci, len);
if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) {
@@ -1784,19 +1776,35 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io,
}
#ifdef CONFIG_BLK_DEV_ZONED
-static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
- struct bio *bio)
+static inline bool dm_zone_bio_needs_split(struct bio *bio)
{
/*
- * For mapped device that need zone append emulation, we must
- * split any large BIO that straddles zone boundaries.
+ * Special case the zone operations that cannot or should not be split.
*/
- return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
- !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+ switch (bio_op(bio)) {
+ case REQ_OP_ZONE_APPEND:
+ case REQ_OP_ZONE_FINISH:
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
+ return false;
+ default:
+ break;
+ }
+
+ /*
+ * When mapped devices use the block layer zone write plugging, we must
+ * split any large BIO to the mapped device limits to not submit BIOs
+ * that span zone boundaries and to avoid potential deadlocks with
+ * queue freeze operations.
+ */
+ return bio_needs_zone_write_plugging(bio) || bio_straddles_zones(bio);
}
+
static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
{
- return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+ if (!bio_needs_zone_write_plugging(bio))
+ return false;
+ return blk_zone_plug_bio(bio, 0);
}
static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci,
@@ -1849,7 +1857,7 @@ static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci,
* not go crazy with the clone allocation.
*/
alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32),
- NULL, GFP_NOIO);
+ NULL);
}
/* Get a clone and change it to a regular reset operation. */
@@ -1881,7 +1889,7 @@ static void __send_zone_reset_all_native(struct clone_info *ci,
unsigned int bios;
atomic_add(1, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, 1, NULL, GFP_NOIO);
+ bios = __send_duplicate_bios(ci, ti, 1, NULL);
atomic_sub(1 - bios, &ci->io->io_count);
ci->sector_count = 0;
@@ -1912,8 +1920,7 @@ static blk_status_t __send_zone_reset_all(struct clone_info *ci)
}
#else
-static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
- struct bio *bio)
+static inline bool dm_zone_bio_needs_split(struct bio *bio)
{
return false;
}
@@ -1940,9 +1947,7 @@ static void dm_split_and_process_bio(struct mapped_device *md,
is_abnormal = is_abnormal_io(bio);
if (static_branch_unlikely(&zoned_enabled)) {
- /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */
- need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) &&
- (is_abnormal || dm_zone_bio_needs_split(md, bio));
+ need_split = is_abnormal || dm_zone_bio_needs_split(bio);
} else {
need_split = is_abnormal;
}
@@ -1969,6 +1974,15 @@ static void dm_split_and_process_bio(struct mapped_device *md,
/* Only support nowait for normal IO */
if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
+ /*
+ * Don't support NOWAIT for FLUSH because it may allocate
+ * multiple bios and there's no easy way how to undo the
+ * allocations.
+ */
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ bio_wouldblock_error(bio);
+ return;
+ }
io = alloc_io(md, bio, GFP_NOWAIT);
if (unlikely(!io)) {
/* Unable to do anything without dm_io. */
@@ -1980,12 +1994,30 @@ static void dm_split_and_process_bio(struct mapped_device *md,
}
init_clone_info(&ci, io, map, bio, is_abnormal);
- if (bio->bi_opf & REQ_PREFLUSH) {
+ if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) {
+ /*
+ * The "flush_bypasses_map" is set on targets where it is safe
+ * to skip the map function and submit bios directly to the
+ * underlying block devices - currently, it is set for dm-linear
+ * and dm-stripe.
+ *
+ * If we have just one underlying device (i.e. there is one
+ * linear target or multiple linear targets pointing to the same
+ * device), we can send the flush with data directly to it.
+ */
+ if (bio->bi_iter.bi_size && map->flush_bypasses_map) {
+ struct list_head *devices = dm_table_get_devices(map);
+ if (devices->next == devices->prev)
+ goto send_preflush_with_data;
+ }
+ if (bio->bi_iter.bi_size)
+ io->requeue_flush_with_data = true;
__send_empty_flush(&ci);
/* dm_io_complete submits any data associated with flush */
goto out;
}
+send_preflush_with_data:
if (static_branch_unlikely(&zoned_enabled) &&
(bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) {
error = __send_zone_reset_all(&ci);
@@ -2406,21 +2438,35 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct queue_limits *limits)
{
struct dm_table *old_map;
- sector_t size;
+ sector_t size, old_size;
int ret;
lockdep_assert_held(&md->suspend_lock);
size = dm_table_get_size(t);
+ old_size = dm_get_size(md);
+
+ if (!dm_table_supports_size_change(t, old_size, size)) {
+ old_map = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ set_capacity(md->disk, size);
+
+ ret = dm_table_set_restrictions(t, md->queue, limits);
+ if (ret) {
+ set_capacity(md->disk, old_size);
+ old_map = ERR_PTR(ret);
+ goto out;
+ }
+
/*
* Wipe any geometry if the size of the table changed.
*/
- if (size != dm_get_size(md))
+ if (size != old_size)
memset(&md->geometry, 0, sizeof(md->geometry));
- set_capacity(md->disk, size);
-
dm_table_event_callback(t, event_callback, md);
if (dm_table_request_based(t)) {
@@ -2438,10 +2484,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
* requests in the queue may refer to bio from the old bioset,
* so you must walk through the queue to unprep.
*/
- if (!md->mempools) {
+ if (!md->mempools)
md->mempools = t->mempools;
- t->mempools = NULL;
- }
+ else
+ dm_free_md_mempools(t->mempools);
} else {
/*
* The md may already have mempools that need changing.
@@ -2450,14 +2496,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
*/
dm_free_md_mempools(md->mempools);
md->mempools = t->mempools;
- t->mempools = NULL;
- }
-
- ret = dm_table_set_restrictions(t, md->queue, limits);
- if (ret) {
- old_map = ERR_PTR(ret);
- goto out;
}
+ t->mempools = NULL;
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, (void *)t);
@@ -2884,7 +2924,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
{
bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
- int r;
+ int r = 0;
lockdep_assert_held(&md->suspend_lock);
@@ -2936,8 +2976,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* Stop md->queue before flushing md->wq in case request-based
* dm defers requests to md->wq from md->queue.
*/
- if (dm_request_based(md))
+ if (map && dm_request_based(md)) {
dm_stop_queue(md->queue);
+ set_bit(DMF_QUEUE_STOPPED, &md->flags);
+ }
flush_workqueue(md->wq);
@@ -2946,7 +2988,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* We call dm_wait_for_completion to wait for all existing requests
* to finish.
*/
- r = dm_wait_for_completion(md, task_state);
+ if (map)
+ r = dm_wait_for_completion(md, task_state);
if (!r)
set_bit(dmf_suspended_flag, &md->flags);
@@ -2959,7 +3002,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
if (r < 0) {
dm_queue_flush(md);
- if (dm_request_based(md))
+ if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags))
dm_start_queue(md->queue);
unlock_fs(md);
@@ -3043,7 +3086,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map)
* so that mapping of targets can work correctly.
* Request-based dm is queueing the deferred I/Os in its request_queue.
*/
- if (dm_request_based(md))
+ if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags))
dm_start_queue(md->queue);
unlock_fs(md);
@@ -3623,10 +3666,13 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
int r, srcu_idx;
+ bool forward = true;
- r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
+ /* Not a real ioctl, but targets must not interpret non-DM ioctls */
+ r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward);
if (r < 0)
goto out;
+ WARN_ON_ONCE(!forward);
ops = bdev->bd_disk->fops->pr_ops;
if (ops && ops->pr_clear)
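One detail worth spelling out from the flush-bypass hunk above: dm_table_get_devices() returns a circular doubly linked list head, so devices->next == devices->prev holds exactly when the list has at most one entry, i.e. the table maps to a single underlying device. A standalone sketch of why that check works:

#include <stdbool.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* True for an empty list or a list with exactly one entry. */
static bool list_has_at_most_one(const struct list_head *h)
{
	return h->next == h->prev;
}

int main(void)
{
	struct list_head head, a, b;

	list_init(&head);
	printf("empty: %d\n", list_has_at_most_one(&head));	/* 1 */
	list_add_tail(&a, &head);
	printf("one:   %d\n", list_has_at_most_one(&head));	/* 1 */
	list_add_tail(&b, &head);
	printf("two:   %d\n", list_has_at_most_one(&head));	/* 0 */
	return 0;
}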
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a0a8ff119815..7a795979ec72 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -58,6 +58,7 @@ void dm_table_event_callback(struct dm_table *t,
void (*fn)(void *), void *context);
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
bool dm_table_has_no_data_devices(struct dm_table *table);
+bool dm_table_is_wildcard(struct dm_table *t);
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits);
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -72,6 +73,8 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
+bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size,
+ sector_t new_size);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
@@ -102,20 +105,24 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *lim);
int dm_revalidate_zones(struct dm_table *t, struct request_queue *q);
+void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args);
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
sector_t sector, unsigned int nr_zones,
unsigned long *need_reset);
+#define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL)
#else
#define dm_blk_report_zones NULL
static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
return false;
}
+#define dm_has_zone_plugs(md) false
#endif
/*
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index b2a00f213c2c..4b80165afd23 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -49,6 +49,7 @@ static int md_setup_ents __initdata;
* instead of just one. -- KTK
* 18May2000: Added support for persistent-superblock arrays:
* md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
* md=n,device-list reads a RAID superblock from the devices
* elements in device-list are read by name_to_kdev_t so can be
* a hex number or something like /dev/hda1 /dev/sdb
@@ -87,7 +88,7 @@ static int __init md_setup(char *str)
md_setup_ents++;
switch (get_option(&str, &level)) { /* RAID level */
case 2: /* could be 0 or -1.. */
- if (level == 0) {
+ if (level == 0 || level == LEVEL_LINEAR) {
if (get_option(&str, &factor) != 2 || /* Chunk Size */
get_option(&str, &fault) != 2) {
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
@@ -95,7 +96,10 @@ static int __init md_setup(char *str)
}
md_setup_args[ent].level = level;
md_setup_args[ent].chunk = 1 << (factor+12);
- pername = "raid0";
+ if (level == LEVEL_LINEAR)
+ pername = "linear";
+ else
+ pername = "raid0";
break;
}
fallthrough;
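With the md-autodetect change above, the legacy md= boot parameter accepts -1 as a level to assemble a LINEAR array, mirroring the existing RAID0 form. For illustration (device names are placeholders):

    md=0,-1,4,0,/dev/sda1,/dev/sdb1

assembles md0 as a linear concatenation of the two partitions with chunk = 1 << (4 + 12) bytes = 64 KiB; per the parsing shown above, the fault field is read but not stored for this level.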
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c3a42dd66ce5..84b7e2af6dba 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -29,17 +29,10 @@
#include <linux/buffer_head.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>
+
#include "md.h"
#include "md-bitmap.h"
-
-#define BITMAP_MAJOR_LO 3
-/* version 4 insists the bitmap is in little-endian order
- * with version 3, it is host-endian which is non-portable
- * Version 5 is currently set only for clustered devices
- */
-#define BITMAP_MAJOR_HI 4
-#define BITMAP_MAJOR_CLUSTERED 5
-#define BITMAP_MAJOR_HOSTENDIAN 3
+#include "md-cluster.h"
/*
* in-memory bitmap:
@@ -103,9 +96,19 @@
*
*/
+typedef __u16 bitmap_counter_t;
+
#define PAGE_BITS (PAGE_SIZE << 3)
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+
#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
@@ -212,6 +215,8 @@ struct bitmap {
int cluster_slot;
};
+static struct workqueue_struct *md_bitmap_wq;
+
static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, bool init);
@@ -220,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap)
return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}
-static bool __bitmap_enabled(struct bitmap *bitmap)
+static bool bitmap_enabled(void *data, bool flush)
{
- return bitmap->storage.filemap &&
- !test_bit(BITMAP_STALE, &bitmap->flags);
-}
-
-static bool bitmap_enabled(struct mddev *mddev)
-{
- struct bitmap *bitmap = mddev->bitmap;
+ struct bitmap *bitmap = data;
- if (!bitmap)
- return false;
+ if (!flush)
+ return true;
- return __bitmap_enabled(bitmap);
+ /*
+	 * If the caller wants to flush bitmap pages to the underlying disks,
+	 * check whether there are cached pages in the filemap.
+ */
+ return !test_bit(BITMAP_STALE, &bitmap->flags) &&
+ bitmap->storage.filemap != NULL;
}
/*
@@ -426,8 +430,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
- unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) <<
- PAGE_SHIFT;
+ unsigned long num_pages = bitmap->storage.file_pages;
+ unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT;
loff_t sboff, offset = mddev->bitmap_info.offset;
sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
unsigned int size = PAGE_SIZE;
@@ -436,7 +440,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
/* we compare length (page numbers), not page offset. */
- if ((pg_index - store->sb_index) == store->file_pages - 1) {
+ if ((pg_index - store->sb_index) == num_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
@@ -472,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
return -EINVAL;
}
- md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
+ md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit),
+ page, 0);
return 0;
}
@@ -682,7 +687,7 @@ static void bitmap_update_sb(void *data)
return;
if (!bitmap->storage.sb_page) /* no superblock */
return;
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
sb->events = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared)
/* rocking back to read-only */
@@ -702,7 +707,7 @@ static void bitmap_update_sb(void *data)
sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
- kunmap_atomic(sb);
+ kunmap_local(sb);
if (bitmap->storage.file)
write_file_page(bitmap, bitmap->storage.sb_page, 1);
@@ -717,7 +722,7 @@ static void bitmap_print_sb(struct bitmap *bitmap)
if (!bitmap || !bitmap->storage.sb_page)
return;
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
pr_debug(" version: %u\n", le32_to_cpu(sb->version));
@@ -736,7 +741,7 @@ static void bitmap_print_sb(struct bitmap *bitmap)
pr_debug(" sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
- kunmap_atomic(sb);
+ kunmap_local(sb);
}
/*
@@ -760,7 +765,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
return -ENOMEM;
bitmap->storage.sb_index = 0;
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
sb->magic = cpu_to_le32(BITMAP_MAGIC);
sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -768,7 +773,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
chunksize = bitmap->mddev->bitmap_info.chunksize;
BUG_ON(!chunksize);
if (!is_power_of_2(chunksize)) {
- kunmap_atomic(sb);
+ kunmap_local(sb);
pr_warn("bitmap chunksize not a power of 2\n");
return -EINVAL;
}
@@ -787,7 +792,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
* is a good choice? We choose COUNTER_MAX / 2 arbitrarily.
*/
write_behind = bitmap->mddev->bitmap_info.max_write_behind;
- if (write_behind > COUNTER_MAX)
+ if (write_behind > COUNTER_MAX / 2)
write_behind = COUNTER_MAX / 2;
sb->write_behind = cpu_to_le32(write_behind);
bitmap->mddev->bitmap_info.max_write_behind = write_behind;
@@ -803,7 +808,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
bitmap->mddev->bitmap_info.nodes = 0;
- kunmap_atomic(sb);
+ kunmap_local(sb);
return 0;
}
@@ -865,7 +870,7 @@ re_read:
return err;
err = -EINVAL;
- sb = kmap_atomic(sb_page);
+ sb = kmap_local_page(sb_page);
chunksize = le32_to_cpu(sb->chunksize);
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -932,7 +937,7 @@ re_read:
err = 0;
out:
- kunmap_atomic(sb);
+ kunmap_local(sb);
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
/* Assigning chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
@@ -942,7 +947,7 @@ out:
bmname(bitmap), err);
goto out_no_sb;
}
- bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+ bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev);
goto re_read;
}
@@ -1161,12 +1166,12 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
bit = file_page_offset(&bitmap->storage, chunk);
/* set the bit */
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
set_bit(bit, kaddr);
else
set_bit_le(bit, kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, index);
/* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
@@ -1190,12 +1195,12 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
if (!page)
return;
bit = file_page_offset(&bitmap->storage, chunk);
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
clear_bit(bit, paddr);
else
clear_bit_le(bit, paddr);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
@@ -1214,12 +1219,12 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
if (!page)
return -EINVAL;
bit = file_page_offset(&bitmap->storage, chunk);
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
set = test_bit(bit, paddr);
else
set = test_bit_le(bit, paddr);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
return set;
}
@@ -1232,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap)
int dirty, need_write;
int writing = 0;
- if (!__bitmap_enabled(bitmap))
+ if (!bitmap_enabled(bitmap, true))
return;
/* look at each page to see if there are any set bits that need to be
@@ -1388,9 +1393,9 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
* If the bitmap is out of date, dirty the whole page
* and write it out
*/
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
memset(paddr + offset, 0xff, PAGE_SIZE - offset);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
filemap_write_page(bitmap, i, true);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
@@ -1406,12 +1411,12 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
void *paddr;
bool was_set;
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
was_set = test_bit(bit, paddr);
else
was_set = test_bit_le(bit, paddr);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
if (was_set) {
/* if the disk bit is set, set the memory bit */
@@ -1546,10 +1551,10 @@ static void bitmap_daemon_work(struct mddev *mddev)
bitmap_super_t *sb;
bitmap->need_sync = 0;
if (bitmap->storage.filemap) {
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
- kunmap_atomic(sb);
+ kunmap_local(sb);
set_page_attr(bitmap, 0,
BITMAP_PAGE_NEEDWRITE);
}
@@ -1670,24 +1675,13 @@ __acquires(bitmap->lock)
&(bitmap->bp[page].map[pageoff]);
}
-static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind)
+static void bitmap_start_write(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
- return 0;
-
- if (behind) {
- int bw;
- atomic_inc(&bitmap->behind_writes);
- bw = atomic_read(&bitmap->behind_writes);
- if (bw > bitmap->behind_writes_used)
- bitmap->behind_writes_used = bw;
-
- pr_debug("inc write-behind count %d/%lu\n",
- bw, bitmap->mddev->bitmap_info.max_write_behind);
- }
+ return;
while (sectors) {
sector_t blocks;
@@ -1697,7 +1691,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
if (!bmc) {
spin_unlock_irq(&bitmap->counts.lock);
- return 0;
+ return;
}
if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
@@ -1733,25 +1727,16 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
else
sectors = 0;
}
- return 0;
}
-static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind)
+static void bitmap_end_write(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
- if (behind) {
- if (atomic_dec_and_test(&bitmap->behind_writes))
- wake_up(&bitmap->behind_wait);
- pr_debug("dec write-behind count %d/%lu\n",
- atomic_read(&bitmap->behind_writes),
- bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
unsigned long flags;
@@ -1764,15 +1749,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
return;
}
- if (success && !bitmap->mddev->degraded &&
- bitmap->events_cleared < bitmap->mddev->events) {
- bitmap->events_cleared = bitmap->mddev->events;
- bitmap->need_sync = 1;
- sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
- }
-
- if (!success && !NEEDED(*bmc))
+ if (!bitmap->mddev->degraded) {
+ if (bitmap->events_cleared < bitmap->mddev->events) {
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->need_sync = 1;
+ sysfs_notify_dirent_safe(
+ bitmap->sysfs_can_clear);
+ }
+ } else if (!NEEDED(*bmc)) {
*bmc |= NEEDED_MASK;
+ }
if (COUNTER(*bmc) == COUNTER_MAX)
wake_up(&bitmap->overflow_wait);
@@ -1795,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
sector_t *blocks, bool degraded)
{
bitmap_counter_t *bmc;
- bool rv;
+ bool rv = false;
- if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
- *blocks = 1024;
- return true; /* always resync if no bitmap */
- }
spin_lock_irq(&bitmap->counts.lock);
-
- rv = false;
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
if (bmc) {
/* locked */
@@ -1852,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
bitmap_counter_t *bmc;
unsigned long flags;
- if (bitmap == NULL) {
- *blocks = 1024;
- return;
- }
spin_lock_irqsave(&bitmap->counts.lock, flags);
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
if (bmc == NULL)
@@ -1994,12 +1970,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
md_bitmap_set_memory_bits(bitmap, sec, 1);
md_bitmap_file_set_bit(bitmap, sec);
- if (sec < bitmap->mddev->recovery_cp)
+ if (sec < bitmap->mddev->resync_offset)
/* We are asserting that the array is dirty,
- * so move the recovery_cp address back so
+ * so move the resync_offset address back so
* that it is obvious that it is dirty
*/
- bitmap->mddev->recovery_cp = sec;
+ bitmap->mddev->resync_offset = sec;
}
}
@@ -2039,7 +2015,7 @@ static void md_bitmap_free(void *data)
sysfs_put(bitmap->sysfs_can_clear);
if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
- bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+ bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev))
md_cluster_stop(bitmap->mddev);
/* Shouldn't be needed - but just in case.... */
@@ -2062,6 +2038,31 @@ static void md_bitmap_free(void *data)
kfree(bitmap);
}
+static void bitmap_start_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ int bw;
+
+ atomic_inc(&bitmap->behind_writes);
+ bw = atomic_read(&bitmap->behind_writes);
+ if (bw > bitmap->behind_writes_used)
+ bitmap->behind_writes_used = bw;
+
+ pr_debug("inc write-behind count %d/%lu\n",
+ bw, bitmap->mddev->bitmap_info.max_write_behind);
+}
+
+static void bitmap_end_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (atomic_dec_and_test(&bitmap->behind_writes))
+ wake_up(&bitmap->behind_wait);
+ pr_debug("dec write-behind count %d/%lu\n",
+ atomic_read(&bitmap->behind_writes),
+ bitmap->mddev->bitmap_info.max_write_behind);
+}
+
static void bitmap_wait_behind_writes(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -2190,9 +2191,9 @@ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
return ERR_PTR(err);
}
-static int bitmap_create(struct mddev *mddev, int slot)
+static int bitmap_create(struct mddev *mddev)
{
- struct bitmap *bitmap = __bitmap_create(mddev, slot);
+ struct bitmap *bitmap = __bitmap_create(mddev, -1);
if (IS_ERR(bitmap))
return PTR_ERR(bitmap);
@@ -2216,7 +2217,7 @@ static int bitmap_load(struct mddev *mddev)
mddev_create_serial_pool(mddev, rdev);
if (mddev_is_clustered(mddev))
- md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
+ mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
/* Clear out old bitmap info first: Either there is none, or we
* are resuming after someone else has possibly changed things,
@@ -2234,7 +2235,7 @@ static int bitmap_load(struct mddev *mddev)
|| bitmap->events_cleared == mddev->events)
/* no need to keep dirty bits to optimise a
* re-add of a missing device */
- start = mddev->recovery_cp;
+ start = mddev->resync_offset;
mutex_lock(&mddev->bitmap_info.mutex);
err = md_bitmap_init_from_disk(bitmap, start);
@@ -2342,7 +2343,8 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
-
+ if (!bitmap->storage.sb_page)
+ return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
@@ -2568,15 +2570,14 @@ err:
return ret;
}
-static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
- bool init)
+static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return 0;
- return __bitmap_resize(bitmap, blocks, chunksize, init);
+ return __bitmap_resize(bitmap, blocks, chunksize, false);
}
static ssize_t
@@ -2653,7 +2654,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
mddev->bitmap_info.offset = offset;
- rv = bitmap_create(mddev, -1);
+ rv = bitmap_create(mddev);
if (rv)
goto out;
@@ -2965,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = {
&max_backlog_used.attr,
NULL
};
-const struct attribute_group md_bitmap_group = {
+
+static struct attribute_group md_bitmap_group = {
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
static struct bitmap_operations bitmap_ops = {
+ .head = {
+ .type = MD_BITMAP,
+ .id = ID_BITMAP,
+ .name = "bitmap",
+ },
+
.enabled = bitmap_enabled,
.create = bitmap_create,
.resize = bitmap_resize,
@@ -2981,10 +2989,16 @@ static struct bitmap_operations bitmap_ops = {
.dirty_bits = bitmap_dirty_bits,
.unplug = bitmap_unplug,
.daemon_work = bitmap_daemon_work,
+
+ .start_behind_write = bitmap_start_behind_write,
+ .end_behind_write = bitmap_end_behind_write,
.wait_behind_writes = bitmap_wait_behind_writes,
- .startwrite = bitmap_startwrite,
- .endwrite = bitmap_endwrite,
+ .start_write = bitmap_start_write,
+ .end_write = bitmap_end_write,
+ .start_discard = bitmap_start_write,
+ .end_discard = bitmap_end_write,
+
.start_sync = bitmap_start_sync,
.end_sync = bitmap_end_sync,
.cond_end_sync = bitmap_cond_end_sync,
@@ -2998,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = {
.copy_from_slot = bitmap_copy_from_slot,
.set_pages = bitmap_set_pages,
.free = md_bitmap_free,
+
+ .group = &md_bitmap_group,
};
-void mddev_set_bitmap_ops(struct mddev *mddev)
+int md_bitmap_init(void)
+{
+ md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
+ 0);
+ if (!md_bitmap_wq)
+ return -ENOMEM;
+
+ return register_md_submodule(&bitmap_ops.head);
+}
+
+void md_bitmap_exit(void)
{
- mddev->bitmap_ops = &bitmap_ops;
+ destroy_workqueue(md_bitmap_wq);
+ unregister_md_submodule(&bitmap_ops.head);
}
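Since the 16-bit counter definitions are now private to md-bitmap.c, a quick worked example of the mask layout they define (bit 15 = NEEDED, bit 14 = RESYNC, bits 13..0 = in-flight write count); this is a standalone sketch of the macros above, not kernel code:

#include <stdint.h>
#include <stdio.h>

typedef uint16_t bitmap_counter_t;

#define COUNTER_BITS 16
#define NEEDED_MASK  ((bitmap_counter_t)(1 << (COUNTER_BITS - 1)))	/* 0x8000 */
#define RESYNC_MASK  ((bitmap_counter_t)(1 << (COUNTER_BITS - 2)))	/* 0x4000 */
#define COUNTER_MAX  ((bitmap_counter_t)RESYNC_MASK - 1)		/* 0x3fff */

#define NEEDED(x)  (((bitmap_counter_t)(x)) & NEEDED_MASK)
#define RESYNC(x)  (((bitmap_counter_t)(x)) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t)(x)) & COUNTER_MAX)

int main(void)
{
	bitmap_counter_t bmc = NEEDED_MASK | 3;	/* needs sync, 3 writes in flight */

	printf("NEEDED=%u RESYNC=%u COUNTER=%u MAX=%u\n",
	       NEEDED(bmc) ? 1 : 0, RESYNC(bmc) ? 1 : 0,
	       (unsigned)COUNTER(bmc), (unsigned)COUNTER_MAX);
	/* prints: NEEDED=1 RESYNC=0 COUNTER=3 MAX=16383 */
	return 0;
}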
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 662e6fc141a7..b42a28fa83a0 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -9,19 +9,26 @@
#define BITMAP_MAGIC 0x6d746962
-typedef __u16 bitmap_counter_t;
-#define COUNTER_BITS 16
-#define COUNTER_BIT_SHIFT 4
-#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
-
-#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
-#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
-#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+/*
+ * version 3 is host-endian order; it is deprecated and not used for new
+ * arrays
+ */
+#define BITMAP_MAJOR_LO 3
+#define BITMAP_MAJOR_HOSTENDIAN 3
+/* version 4 is little-endian order, the default value */
+#define BITMAP_MAJOR_HI 4
+/* version 5 is only used for cluster */
+#define BITMAP_MAJOR_CLUSTERED 5
+/* version 6 is only used for lockless bitmap */
+#define BITMAP_MAJOR_LOCKLESS 6
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
- BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
+ BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+ BITMAP_FIRST_USE = 3, /* llbitmap is just created */
+ BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */
+ BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */
BITMAP_HOSTENDIAN =15,
};
@@ -70,11 +77,15 @@ struct md_bitmap_stats {
struct file *file;
};
+typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset,
+ unsigned long sectors);
+
struct bitmap_operations {
- bool (*enabled)(struct mddev *mddev);
- int (*create)(struct mddev *mddev, int slot);
- int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
- bool init);
+ struct md_submodule_head head;
+
+ bool (*enabled)(void *data, bool flush);
+ int (*create)(struct mddev *mddev);
+ int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize);
int (*load)(struct mddev *mddev);
void (*destroy)(struct mddev *mddev);
@@ -84,12 +95,18 @@ struct bitmap_operations {
unsigned long e);
void (*unplug)(struct mddev *mddev, bool sync);
void (*daemon_work)(struct mddev *mddev);
+
+ void (*start_behind_write)(struct mddev *mddev);
+ void (*end_behind_write)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
- int (*startwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind);
- void (*endwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind);
+ md_bitmap_fn *start_write;
+ md_bitmap_fn *end_write;
+ md_bitmap_fn *start_discard;
+ md_bitmap_fn *end_discard;
+
+ sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset);
+ bool (*blocks_synced)(struct mddev *mddev, sector_t offset);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
@@ -107,9 +124,75 @@ struct bitmap_operations {
sector_t *hi, bool clear_bits);
void (*set_pages)(void *data, unsigned long pages);
void (*free)(void *data);
+
+ struct attribute_group *group;
};
/* the bitmap API */
-void mddev_set_bitmap_ops(struct mddev *mddev);
+static inline bool md_bitmap_registered(struct mddev *mddev)
+{
+ return mddev->bitmap_ops != NULL;
+}
+
+static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush)
+{
+	/* bitmap_ops must be registered before creating the bitmap. */
+ if (!md_bitmap_registered(mddev))
+ return false;
+
+ if (!mddev->bitmap)
+ return false;
+
+ return mddev->bitmap_ops->enabled(mddev->bitmap, flush);
+}
+
+static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded)
+{
+ /* always resync if no bitmap */
+ if (!md_bitmap_enabled(mddev, false)) {
+ *blocks = 1024;
+ return true;
+ }
+
+ return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded);
+}
+
+static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks)
+{
+ if (!md_bitmap_enabled(mddev, false)) {
+ *blocks = 1024;
+ return;
+ }
+
+ mddev->bitmap_ops->end_sync(mddev, offset, blocks);
+}
+
+#ifdef CONFIG_MD_BITMAP
+int md_bitmap_init(void);
+void md_bitmap_exit(void);
+#else
+static inline int md_bitmap_init(void)
+{
+ return 0;
+}
+static inline void md_bitmap_exit(void)
+{
+}
+#endif
+
+#ifdef CONFIG_MD_LLBITMAP
+int md_llbitmap_init(void);
+void md_llbitmap_exit(void);
+#else
+static inline int md_llbitmap_init(void)
+{
+ return 0;
+}
+static inline void md_llbitmap_exit(void)
+{
+}
+#endif
#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 6595f89becdb..11f1e91d387d 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread)
md_wakeup_thread(mddev->sync_thread);
if (hi > 0) {
- if (lo < mddev->recovery_cp)
- mddev->recovery_cp = lo;
+ if (lo < mddev->resync_offset)
+ mddev->resync_offset = lo;
/* wake up thread to continue resync in case resync
* is not finished */
- if (mddev->recovery_cp != MaxSector) {
+ if (mddev->resync_offset != MaxSector) {
/*
* clear the REMOTE flag since we will launch
* resync thread in current node.
@@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
ret = mddev->bitmap_ops->resize(mddev,
le64_to_cpu(msg->high),
- 0, false);
+ 0);
break;
default:
ret = -1;
@@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
lockres_free(bm_lockres);
continue;
}
- if ((hi > 0) && (lo < mddev->recovery_cp)) {
+ if ((hi > 0) && (lo < mddev->resync_offset)) {
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- mddev->recovery_cp = lo;
+ mddev->resync_offset = lo;
md_check_recovery(mddev);
}
@@ -979,7 +979,7 @@ err:
lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
if (cinfo->lockspace)
- dlm_release_lockspace(cinfo->lockspace, 2);
+ dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL);
mddev->cluster_info = NULL;
kfree(cinfo);
return ret;
@@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev)
* Also, we should send BITMAP_NEEDS_SYNC message in
* case reshaping is interrupted.
*/
- if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) ||
+ if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) ||
(mddev->reshape_position != MaxSector &&
test_bit(MD_CLOSING, &mddev->flags)))
resync_bitmap(mddev);
@@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev)
lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
unlock_all_bitmaps(mddev);
- dlm_release_lockspace(cinfo->lockspace, 2);
+ dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL);
kfree(cinfo);
return 0;
}
@@ -1166,7 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
struct dlm_lock_resource *bm_lockres;
char str[64];
- if (i == md_cluster_ops->slot_number(mddev))
+ if (i == slot_number(mddev))
continue;
bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
@@ -1216,7 +1216,7 @@ out:
*/
static int cluster_check_sync_size(struct mddev *mddev)
{
- int current_slot = md_cluster_ops->slot_number(mddev);
+ int current_slot = slot_number(mddev);
int node_num = mddev->bitmap_info.nodes;
struct dlm_lock_resource *bm_lockres;
struct md_bitmap_stats stats;
@@ -1605,14 +1605,21 @@ static int gather_bitmaps(struct md_rdev *rdev)
pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
goto out;
}
- if ((hi > 0) && (lo < mddev->recovery_cp))
- mddev->recovery_cp = lo;
+ if ((hi > 0) && (lo < mddev->resync_offset))
+ mddev->resync_offset = lo;
}
out:
return err;
}
-static const struct md_cluster_operations cluster_ops = {
+static struct md_cluster_operations cluster_ops = {
+ .head = {
+ .type = MD_CLUSTER,
+ .id = ID_CLUSTER,
+ .name = "cluster",
+ .owner = THIS_MODULE,
+ },
+
.join = join,
.leave = leave,
.slot_number = slot_number,
@@ -1642,13 +1649,12 @@ static int __init cluster_init(void)
{
pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
pr_info("Registering Cluster MD functions\n");
- register_md_cluster_operations(&cluster_ops, THIS_MODULE);
- return 0;
+ return register_md_submodule(&cluster_ops.head);
}
static void cluster_exit(void)
{
- unregister_md_cluster_operations();
+ unregister_md_submodule(&cluster_ops.head);
}
module_init(cluster_init);
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 470bf18ffde5..8fb06d853173 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -10,6 +10,8 @@ struct mddev;
struct md_rdev;
struct md_cluster_operations {
+ struct md_submodule_head head;
+
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
@@ -35,4 +37,8 @@ struct md_cluster_operations {
void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
};
+extern int md_setup_cluster(struct mddev *mddev, int nodes);
+extern void md_cluster_stop(struct mddev *mddev);
+extern void md_reload_sb(struct mddev *mddev, int raid_disk);
+
#endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
new file mode 100644
index 000000000000..8d7b82c4a723
--- /dev/null
+++ b/drivers/md/md-linear.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc
+ * ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/block.h>
+#include "md.h"
+
+struct dev_info {
+ struct md_rdev *rdev;
+ sector_t end_sector;
+};
+
+struct linear_conf {
+ struct rcu_head rcu;
+ sector_t array_sectors;
+ /* a copy of mddev->raid_disks */
+ int raid_disks;
+ struct dev_info disks[] __counted_by(raid_disks);
+};
+
+/*
+ * find which device holds a particular offset
+ */
+static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
+{
+ int lo, mid, hi;
+ struct linear_conf *conf;
+
+ lo = 0;
+ hi = mddev->raid_disks - 1;
+ conf = mddev->private;
+
+ /*
+ * Binary Search
+ */
+
+ while (hi > lo) {
+
+ mid = (hi + lo) / 2;
+ if (sector < conf->disks[mid].end_sector)
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+
+ return conf->disks + lo;
+}
+
+static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ struct linear_conf *conf;
+ sector_t array_sectors;
+
+ conf = mddev->private;
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+ array_sectors = conf->array_sectors;
+
+ return array_sectors;
+}
+
+static int linear_set_limits(struct mddev *mddev)
+{
+ struct queue_limits lim;
+ int err;
+
+ md_init_stacking_limits(&lim);
+ lim.max_hw_sectors = mddev->chunk_sectors;
+ lim.logical_block_size = mddev->logical_block_size;
+ lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+ lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
+ lim.io_min = mddev->chunk_sectors << 9;
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
+ err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
+ if (err)
+ return err;
+
+ return queue_limits_set(mddev->gendisk->queue, &lim);
+}
+
+static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
+{
+ struct linear_conf *conf;
+ struct md_rdev *rdev;
+ int ret = -EINVAL;
+ int cnt;
+ int i;
+
+ conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
+ if (!conf)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+	 * conf->raid_disks is a copy of mddev->raid_disks. The reason to
+	 * keep this copy in struct linear_conf is that mddev->raid_disks
+	 * may not be consistent with the number of pointers in conf->disks[]
+	 * while it is updated in linear_add() and the old conf->disks[]
+	 * array is still being iterated in linear_congested(). Here
+	 * conf->raid_disks is always consistent with the number of pointers
+	 * in the conf->disks[] array, and mddev->private is updated with
+	 * rcu_assign_pointer() in linear_add(), so that race is avoided.
+ */
+ conf->raid_disks = raid_disks;
+
+ cnt = 0;
+ conf->array_sectors = 0;
+
+ rdev_for_each(rdev, mddev) {
+ int j = rdev->raid_disk;
+ struct dev_info *disk = conf->disks + j;
+ sector_t sectors;
+
+ if (j < 0 || j >= raid_disks || disk->rdev) {
+ pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ disk->rdev = rdev;
+ if (mddev->chunk_sectors) {
+ sectors = rdev->sectors;
+ sector_div(sectors, mddev->chunk_sectors);
+ rdev->sectors = sectors * mddev->chunk_sectors;
+ }
+
+ conf->array_sectors += rdev->sectors;
+ cnt++;
+ }
+ if (cnt != raid_disks) {
+ pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ /*
+ * Here we calculate the device offsets.
+ */
+ conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
+
+ for (i = 1; i < raid_disks; i++)
+ conf->disks[i].end_sector =
+ conf->disks[i-1].end_sector +
+ conf->disks[i].rdev->sectors;
+
+ if (!mddev_is_dm(mddev)) {
+ ret = linear_set_limits(mddev);
+ if (ret)
+ goto out;
+ }
+
+ return conf;
+
+out:
+ kfree(conf);
+ return ERR_PTR(ret);
+}
+
+static int linear_run(struct mddev *mddev)
+{
+ struct linear_conf *conf;
+ int ret;
+
+ if (md_check_no_bitmap(mddev))
+ return -EINVAL;
+
+ conf = linear_conf(mddev, mddev->raid_disks);
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ mddev->private = conf;
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+
+ ret = md_integrity_register(mddev);
+ if (ret) {
+ kfree(conf);
+ mddev->private = NULL;
+ }
+ return ret;
+}
+
+static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
+{
+ /* Adding a drive to a linear array allows the array to grow.
+ * It is permitted if the new drive has a matching superblock
+ * already on it, with raid_disk equal to raid_disks.
+ * It is achieved by creating a new linear_private_data structure
+ * and swapping it in place of the current one.
+ * The current one is never freed until the array is stopped.
+ * This avoids races.
+ */
+ struct linear_conf *newconf, *oldconf;
+
+ if (rdev->saved_raid_disk != mddev->raid_disks)
+ return -EINVAL;
+
+ rdev->raid_disk = rdev->saved_raid_disk;
+ rdev->saved_raid_disk = -1;
+
+ newconf = linear_conf(mddev, mddev->raid_disks + 1);
+ if (IS_ERR(newconf))
+ return PTR_ERR(newconf);
+
+	/* newconf->raid_disks already keeps a copy of the increased
+ * value of mddev->raid_disks, WARN_ONCE() is just used to make
+ * sure of this. It is possible that oldconf is still referenced
+ * in linear_congested(), therefore kfree_rcu() is used to free
+ * oldconf until no one uses it anymore.
+ */
+ oldconf = rcu_dereference_protected(mddev->private,
+ lockdep_is_held(&mddev->reconfig_mutex));
+ mddev->raid_disks++;
+ WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+ "copied raid_disks doesn't match mddev->raid_disks");
+ rcu_assign_pointer(mddev->private, newconf);
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
+ kfree_rcu(oldconf, rcu);
+ return 0;
+}
+
+static void linear_free(struct mddev *mddev, void *priv)
+{
+ struct linear_conf *conf = priv;
+
+ kfree(conf);
+}
+
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct dev_info *tmp_dev;
+ sector_t start_sector, end_sector, data_offset;
+ sector_t bio_sector = bio->bi_iter.bi_sector;
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
+ return true;
+
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
+ md_error(mddev, tmp_dev->rdev);
+ bio_io_error(bio);
+ return true;
+ }
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to split it */
+ bio = bio_submit_split_bioset(bio, end_sector - bio_sector,
+ &mddev->bio_set);
+ if (!bio)
+ return true;
+ }
+
+ md_account_bio(mddev, &bio);
+ bio_set_dev(bio, tmp_dev->rdev->bdev);
+ bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !bdev_max_discard_sectors(bio->bi_bdev))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_write_zeroes(mddev, bio);
+ submit_bio_noacct(bio);
+ }
+ return true;
+
+out_of_bounds:
+ pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
+ mdname(mddev),
+ (unsigned long long)bio->bi_iter.bi_sector,
+ tmp_dev->rdev->bdev,
+ (unsigned long long)tmp_dev->rdev->sectors,
+ (unsigned long long)start_sector);
+ bio_io_error(bio);
+ return true;
+}
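
A worked example of the remapping arithmetic above (not part of the patch, all numbers are made up): a bio landing on a member whose cumulative end_sector is 250 and whose size is 150 sectors is shifted down by start_sector and up by the member's data_offset:

#include <assert.h>

int main(void)
{
	unsigned long long end_sector = 250, dev_sectors = 150, data_offset = 16;
	unsigned long long bio_sector = 180;				/* array-relative sector */
	unsigned long long start_sector = end_sector - dev_sectors;	/* 100 */

	/* same formula as linear_make_request(): member-relative sector */
	unsigned long long remapped = bio_sector - start_sector + data_offset;

	assert(remapped == 96);
	return 0;
}
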
+
+static void linear_status(struct seq_file *seq, struct mddev *mddev)
+{
+ seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+}
+
+static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+ char *md_name = mdname(mddev);
+
+ pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
+ md_name, rdev->bdev);
+ }
+}
+
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
+static struct md_personality linear_personality = {
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_LINEAR,
+ .name = "linear",
+ .owner = THIS_MODULE,
+ },
+
+ .make_request = linear_make_request,
+ .run = linear_run,
+ .free = linear_free,
+ .status = linear_status,
+ .hot_add_disk = linear_add,
+ .size = linear_size,
+ .quiesce = linear_quiesce,
+ .error_handler = linear_error,
+};
+
+static int __init linear_init(void)
+{
+ return register_md_submodule(&linear_personality.head);
+}
+
+static void linear_exit(void)
+{
+ unregister_md_submodule(&linear_personality.head);
+}
+
+module_init(linear_init);
+module_exit(linear_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
+MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
new file mode 100644
index 000000000000..9c1ade19b774
--- /dev/null
+++ b/drivers/md/md-llbitmap.c
@@ -0,0 +1,1626 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <trace/events/block.h>
+
+#include "md.h"
+#include "md-bitmap.h"
+
+/*
+ * #### Background
+ *
+ * Redundant data is used to enhance data fault tolerance, and the way the
+ * redundant data is stored varies depending on the RAID level. It is
+ * important to maintain the consistency of that redundant data.
+ *
+ * Bitmap is used to record which data blocks have been synchronized and which
+ * ones need to be resynchronized or recovered. Each bit in the bitmap
+ * represents a segment of data in the array. When a bit is set, it indicates
+ * that the multiple redundant copies of that data segment may not be
+ * consistent. Data synchronization can be performed based on the bitmap after
+ * power failure or readding a disk. If there is no bitmap, a full disk
+ * synchronization is required.
+ *
+ * #### Key Features
+ *
+ * - The IO fastpath is lockless: if the user issues lots of write IO to the
+ * same bitmap bit in a short time, only the first write has the additional
+ * overhead of updating the bitmap bit; the following writes have none;
+ * - Only written data is resynced or recovered, which means that when creating
+ * a new array or replacing a disk with a new one, there is no need to do a
+ * full disk resync/recovery;
+ *
+ * #### Key Concept
+ *
+ * ##### State Machine
+ *
+ * Each bit is one byte and holds one of the states listed in llbitmap_state.
+ * There are a total of 8 different actions, see llbitmap_action, that can change state:
+ *
+ * llbitmap state machine: transitions between states
+ *
+ * | | Startwrite | Startsync | Endsync | Abortsync|
+ * | --------- | ---------- | --------- | ------- | ------- |
+ * | Unwritten | Dirty | x | x | x |
+ * | Clean | Dirty | x | x | x |
+ * | Dirty | x | x | x | x |
+ * | NeedSync | x | Syncing | x | x |
+ * | Syncing | x | Syncing | Dirty | NeedSync |
+ *
+ * | | Reload | Daemon | Discard | Stale |
+ * | --------- | -------- | ------ | --------- | --------- |
+ * | Unwritten | x | x | x | x |
+ * | Clean | x | x | Unwritten | NeedSync |
+ * | Dirty | NeedSync | Clean | Unwritten | NeedSync |
+ * | NeedSync | x | x | Unwritten | x |
+ * | Syncing | NeedSync | x | Unwritten | NeedSync |
+ *
+ * Typical scenarios:
+ *
+ * 1) Create new array
+ * All bits will be set to Unwritten by default, if --assume-clean is set,
+ * all bits will be set to Clean instead.
+ *
+ * 2) write data; raid1/raid10 have a full copy of the data, while raid456
+ * doesn't and relies on xor data
+ *
+ * 2.1) write new data to raid1/raid10:
+ * Unwritten --StartWrite--> Dirty
+ *
+ * 2.2) write new data to raid456:
+ * Unwritten --StartWrite--> NeedSync
+ *
+ * Because the initial recovery for raid456 is skipped, the xor data is not
+ * built yet; the bit must be set to NeedSync first, and after the lazy initial
+ * recovery is finished the bit will finally be set to Dirty (see 5.1 and 5.4);
+ *
+ * 2.3) cover write (overwriting data that is already written)
+ * Clean --StartWrite--> Dirty
+ *
+ * 3) daemon, if the array is not degraded:
+ * Dirty --Daemon--> Clean
+ *
+ * 4) discard
+ * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
+ *
+ * 5) resync and recover
+ *
+ * 5.1) common process
+ * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
+ *
+ * 5.2) resync after power failure
+ * Dirty --Reload--> NeedSync
+ *
+ * 5.3) recovery while replacing with a new disk
+ * By default, the old bitmap framework will recover all data, while llbitmap
+ * uses a new helper, see llbitmap_skip_sync_blocks, to
+ *
+ * skip recovery for bits other than Dirty or Clean;
+ *
+ * 5.4) lazy initial recovery for raid5:
+ * By default, the old bitmap framework only allows a new recovery when there
+ * are spares (new disks); a new recovery flag MD_RECOVERY_LAZY_RECOVER is added
+ * to perform raid456 lazy recovery for set bits (from 2.2).
+ *
+ * 6. special handling for degraded array:
+ *
+ * - Dirty bits will never be cleared and the daemon will just do nothing, so
+ * that if a disk is re-added, Clean bits can be skipped during recovery;
+ * - Dirty bits will be converted to Syncing when a write starts, to do data
+ * recovery for newly added disks;
+ * - New writes will convert bits to NeedSync directly;
+ *
+ * ##### Bitmap IO
+ *
+ * ##### Chunksize
+ *
+ * The default bitmap size is 128k, including the 1k bitmap super block, and
+ * the default size of the data segment covered by each bit (the chunksize) is
+ * 64k. The chunksize is doubled each time the total number of bits would
+ * otherwise exceed 127k (see llbitmap_init).
+ *
+ * ##### READ
+ *
+ * While creating the bitmap, all pages will be allocated and read for
+ * llbitmap; there won't be any reads afterwards.
+ *
+ * ##### WRITE
+ *
+ * WRITE IO is divided into blocks of the array's logical_block_size, and the
+ * dirty state of each block is tracked independently, for example:
+ *
+ * each page is 4k and has 8 blocks; each block is 512 bytes and holds 512 bits;
+ *
+ * | page0 | page1 | ... | page 31 |
+ * | |
+ * | \-----------------------\
+ * | |
+ * | block0 | block1 | ... | block 7 |
+ * | |
+ * | \-----------------\
+ * | |
+ * | bit0 | bit1 | ... | bit511 |
+ *
+ * From the IO path, if one bit is changed to Dirty or NeedSync, the
+ * corresponding subpage will be marked dirty, and such a block must be written
+ * out before the IO is issued. This behaviour affects IO performance; to
+ * reduce the impact, if multiple bits are changed in the same block in a short
+ * time, all bits in this block will be changed to Dirty/NeedSync, so that
+ * there won't be any further overhead until the daemon clears the dirty bits.
+ *
+ * ##### Dirty Bits synchronization
+ *
+ * IO fast path will set bits to dirty, and those dirty bits will be cleared
+ * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between
+ * IO path and daemon;
+ *
+ * IO path:
+ * 1) try to grab a reference, if succeed, set expire time after 5s and return;
+ * 2) if failed to grab a reference, wait for daemon to finish clearing dirty
+ * bits;
+ *
+ * Daemon (Daemon will be woken up every daemon_sleep seconds):
+ * For each page:
+ * 1) check if page expired, if not skip this page; for expired page:
+ * 2) suspend the page and wait for inflight write IO to be done;
+ * 3) change dirty page to clean;
+ * 4) resume the page;
+ */
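
A rough user-space sketch of the chunksize selection described in the "Chunksize" paragraph above (illustrative only; the in-kernel version is llbitmap_init() further down, and the 1 TiB array size and 127k of bitmap payload are made-up inputs):

#include <stdio.h>

int main(void)
{
	unsigned long long blocks = 2ULL * 1024 * 1024 * 1024;	/* 1 TiB array, in 512-byte sectors */
	unsigned long long space = 127 * 1024;			/* bitmap payload: one byte per bit */
	unsigned long long chunksize = 64 * 2;			/* start at 64 KiB, in sectors */
	unsigned long long chunks = (blocks + chunksize - 1) / chunksize;

	/* double the chunksize until all bits fit in the reserved space */
	while (chunks > space) {
		chunksize <<= 1;
		chunks = (blocks + chunksize - 1) / chunksize;
	}
	printf("chunksize %llu sectors, %llu chunks\n", chunksize, chunks);	/* 32768, 65536 */
	return 0;
}
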
+
+#define BITMAP_DATA_OFFSET 1024
+
+/* 64k is the max IO size of sync IO for raid1/raid10 */
+#define MIN_CHUNK_SIZE (64 * 2)
+
+/* By default, daemon will be woken up every 30s */
+#define DEFAULT_DAEMON_SLEEP 30
+
+/*
+ * Dirtied bits that have not been accessed for more than 5s will be cleared
+ * by daemon.
+ */
+#define DEFAULT_BARRIER_IDLE 5
+
+enum llbitmap_state {
+ /* No valid data, init state after assemble the array */
+ BitUnwritten = 0,
+ /* data is consistent */
+ BitClean,
+ /* data will be consistent after IO is done, set directly for writes */
+ BitDirty,
+ /*
+ * data need to be resynchronized:
+ * 1) set directly for writes if array is degraded, prevent full disk
+ * synchronization after readding a disk;
+ * 2) reassemble the array after power failure, and dirty bits are
+ * found after reloading the bitmap;
+ * 3) set for first write for raid5, to build initial xor data lazily
+ */
+ BitNeedSync,
+ /* data is synchronizing */
+ BitSyncing,
+ BitStateCount,
+ BitNone = 0xff,
+};
+
+enum llbitmap_action {
+ /* User write new data, this is the only action from IO fast path */
+ BitmapActionStartwrite = 0,
+ /* Start recovery */
+ BitmapActionStartsync,
+ /* Finish recovery */
+ BitmapActionEndsync,
+ /* Failed recovery */
+ BitmapActionAbortsync,
+ /* Reassemble the array */
+ BitmapActionReload,
+ /* Daemon thread is trying to clear dirty bits */
+ BitmapActionDaemon,
+ /* Data is deleted */
+ BitmapActionDiscard,
+ /*
+ * Bitmap is stale, mark all bits in addition to BitUnwritten to
+ * BitNeedSync.
+ */
+ BitmapActionStale,
+ BitmapActionCount,
+ /* Init state is BitUnwritten */
+ BitmapActionInit,
+};
+
+enum llbitmap_page_state {
+ LLPageFlush = 0,
+ LLPageDirty,
+};
+
+struct llbitmap_page_ctl {
+ char *state;
+ struct page *page;
+ unsigned long expire;
+ unsigned long flags;
+ wait_queue_head_t wait;
+ struct percpu_ref active;
+ /* Per block size dirty state, maximum 64k page / 1 sector = 128 */
+ unsigned long dirty[];
+};
+
+struct llbitmap {
+ struct mddev *mddev;
+ struct llbitmap_page_ctl **pctl;
+
+ unsigned int nr_pages;
+ unsigned int io_size;
+ unsigned int blocks_per_page;
+
+ /* shift of one chunk */
+ unsigned long chunkshift;
+ /* size of one chunk in sector */
+ unsigned long chunksize;
+ /* total number of chunks */
+ unsigned long chunks;
+ unsigned long last_end_sync;
+ /*
+ * time in seconds that dirty bits will be cleared if the page is not
+ * accessed.
+ */
+ unsigned long barrier_idle;
+ /* fires on first BitDirty state */
+ struct timer_list pending_timer;
+ struct work_struct daemon_work;
+
+ unsigned long flags;
+ __u64 events_cleared;
+
+ /* for slow disks */
+ atomic_t behind_writes;
+ wait_queue_head_t behind_wait;
+};
+
+struct llbitmap_unplug_work {
+ struct work_struct work;
+ struct llbitmap *llbitmap;
+ struct completion *done;
+};
+
+static struct workqueue_struct *md_llbitmap_io_wq;
+static struct workqueue_struct *md_llbitmap_unplug_wq;
+
+static char state_machine[BitStateCount][BitmapActionCount] = {
+ [BitUnwritten] = {
+ [BitmapActionStartwrite] = BitDirty,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitNone,
+ [BitmapActionStale] = BitNone,
+ },
+ [BitClean] = {
+ [BitmapActionStartwrite] = BitDirty,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+ [BitDirty] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNeedSync,
+ [BitmapActionDaemon] = BitClean,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+ [BitNeedSync] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitSyncing,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNone,
+ },
+ [BitSyncing] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitSyncing,
+ [BitmapActionEndsync] = BitDirty,
+ [BitmapActionAbortsync] = BitNeedSync,
+ [BitmapActionReload] = BitNeedSync,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+};
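
For reference (not part of the patch): the table is used as a plain two-dimensional lookup, with BitNone meaning "leave the bit unchanged". A trimmed-down user-space sketch of one row, using made-up enum names:

#include <stdio.h>

enum state  { Unwritten, Clean, Dirty, NeedSync, Syncing, None = 0xff };
enum action { Startwrite, Startsync, Endsync, Abortsync, Reload, Daemon, Discard, Stale, ActionCount };

/* the Dirty row of the transition table above */
static const unsigned char dirty_row[ActionCount] = {
	[Startwrite] = None,     [Startsync] = None,  [Endsync] = None,       [Abortsync] = None,
	[Reload]     = NeedSync, [Daemon]    = Clean, [Discard]  = Unwritten, [Stale]     = NeedSync,
};

int main(void)
{
	/* Dirty --Daemon--> Clean; None (0xff) would mean "no transition" */
	printf("%s\n", dirty_row[Daemon] == Clean ? "Dirty --Daemon--> Clean" : "no transition");
	return 0;
}
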
+
+static void __llbitmap_flush(struct mddev *mddev);
+
+static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
+{
+ unsigned int idx;
+ unsigned int offset;
+
+ pos += BITMAP_DATA_OFFSET;
+ idx = pos >> PAGE_SHIFT;
+ offset = offset_in_page(pos);
+
+ return llbitmap->pctl[idx]->state[offset];
+}
+
+/* set all the bits in the subpage as dirty */
+static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
+ struct llbitmap_page_ctl *pctl,
+ unsigned int block)
+{
+ bool level_456 = raid_is_456(llbitmap->mddev);
+ unsigned int io_size = llbitmap->io_size;
+ int pos;
+
+ for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
+ switch (pctl->state[pos]) {
+ case BitUnwritten:
+ pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
+ break;
+ case BitClean:
+ pctl->state[pos] = BitDirty;
+ break;
+ }
+ }
+}
+
+static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
+ int offset)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+ unsigned int io_size = llbitmap->io_size;
+ int block = offset / io_size;
+ int pos;
+
+ if (!test_bit(LLPageDirty, &pctl->flags))
+ set_bit(LLPageDirty, &pctl->flags);
+
+ /*
+	 * For a degraded array, dirty bits will never be cleared, and we must
+	 * resync all of them, hence skip infecting new dirty bits to avoid
+	 * resyncing unnecessary data.
+ */
+ if (llbitmap->mddev->degraded) {
+ set_bit(block, pctl->dirty);
+ return;
+ }
+
+ /*
+ * The subpage usually contains a total of 512 bits. If any single bit
+ * within the subpage is marked as dirty, the entire sector will be
+ * written. To avoid impacting write performance, when multiple bits
+ * within the same sector are modified within llbitmap->barrier_idle,
+ * all bits in the sector will be collectively marked as dirty at once.
+ */
+ if (test_and_set_bit(block, pctl->dirty)) {
+ llbitmap_infect_dirty_bits(llbitmap, pctl, block);
+ return;
+ }
+
+ for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
+ if (pos == offset)
+ continue;
+ if (pctl->state[pos] == BitDirty ||
+ pctl->state[pos] == BitNeedSync) {
+ llbitmap_infect_dirty_bits(llbitmap, pctl, block);
+ return;
+ }
+ }
+}
+
+static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
+ loff_t pos)
+{
+ unsigned int idx;
+ unsigned int bit;
+
+ pos += BITMAP_DATA_OFFSET;
+ idx = pos >> PAGE_SHIFT;
+ bit = offset_in_page(pos);
+
+ llbitmap->pctl[idx]->state[bit] = state;
+ if (state == BitDirty || state == BitNeedSync)
+ llbitmap_set_page_dirty(llbitmap, idx, bit);
+}
+
+static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ struct page *page = NULL;
+ struct md_rdev *rdev;
+
+ if (llbitmap->pctl && llbitmap->pctl[idx])
+ page = llbitmap->pctl[idx]->page;
+ if (page)
+ return page;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ rdev_for_each(rdev, mddev) {
+ sector_t sector;
+
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ sector = mddev->bitmap_info.offset +
+ (idx << PAGE_SECTORS_SHIFT);
+
+ if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
+ true))
+ return page;
+
+ md_error(mddev, rdev);
+ }
+
+ __free_page(page);
+ return ERR_PTR(-EIO);
+}
+
+static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
+{
+ struct page *page = llbitmap->pctl[idx]->page;
+ struct mddev *mddev = llbitmap->mddev;
+ struct md_rdev *rdev;
+ int block;
+
+ for (block = 0; block < llbitmap->blocks_per_page; block++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+
+ if (!test_and_clear_bit(block, pctl->dirty))
+ continue;
+
+ rdev_for_each(rdev, mddev) {
+ sector_t sector;
+ sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;
+
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ sector = mddev->bitmap_info.offset + rdev->sb_start +
+ (idx << PAGE_SECTORS_SHIFT) +
+ block * bit_sector;
+ md_write_metadata(mddev, rdev, sector,
+ llbitmap->io_size, page,
+ block * llbitmap->io_size);
+ }
+ }
+}
+
+static void active_release(struct percpu_ref *ref)
+{
+ struct llbitmap_page_ctl *pctl =
+ container_of(ref, struct llbitmap_page_ctl, active);
+
+ wake_up(&pctl->wait);
+}
+
+static void llbitmap_free_pages(struct llbitmap *llbitmap)
+{
+ int i;
+
+ if (!llbitmap->pctl)
+ return;
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ if (!pctl || !pctl->page)
+ break;
+
+ __free_page(pctl->page);
+ percpu_ref_exit(&pctl->active);
+ }
+
+ kfree(llbitmap->pctl[0]);
+ kfree(llbitmap->pctl);
+ llbitmap->pctl = NULL;
+}
+
+static int llbitmap_cache_pages(struct llbitmap *llbitmap)
+{
+ struct llbitmap_page_ctl *pctl;
+ unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
+ BITMAP_DATA_OFFSET, PAGE_SIZE);
+ unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
+ llbitmap->blocks_per_page));
+ int i;
+
+ llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!llbitmap->pctl)
+ return -ENOMEM;
+
+ size = round_up(size, cache_line_size());
+ pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
+ if (!pctl) {
+ kfree(llbitmap->pctl);
+ return -ENOMEM;
+ }
+
+ llbitmap->nr_pages = nr_pages;
+
+ for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
+ struct page *page = llbitmap_read_page(llbitmap, i);
+
+ llbitmap->pctl[i] = pctl;
+
+ if (IS_ERR(page)) {
+ llbitmap_free_pages(llbitmap);
+ return PTR_ERR(page);
+ }
+
+ if (percpu_ref_init(&pctl->active, active_release,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ __free_page(page);
+ llbitmap_free_pages(llbitmap);
+ return -ENOMEM;
+ }
+
+ pctl->page = page;
+ pctl->state = page_address(page);
+ init_waitqueue_head(&pctl->wait);
+ }
+
+ return 0;
+}
+
+static void llbitmap_init_state(struct llbitmap *llbitmap)
+{
+ enum llbitmap_state state = BitUnwritten;
+ unsigned long i;
+
+ if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
+ state = BitClean;
+
+ for (i = 0; i < llbitmap->chunks; i++)
+ llbitmap_write(llbitmap, state, i);
+}
+
+/* The return value is only used from resync, where @start == @end. */
+static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
+ unsigned long start,
+ unsigned long end,
+ enum llbitmap_action action)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ enum llbitmap_state state = BitNone;
+ bool level_456 = raid_is_456(llbitmap->mddev);
+ bool need_resync = false;
+ bool need_recovery = false;
+
+ if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
+ return BitNone;
+
+ if (action == BitmapActionInit) {
+ llbitmap_init_state(llbitmap);
+ return BitNone;
+ }
+
+ while (start <= end) {
+ enum llbitmap_state c = llbitmap_read(llbitmap, start);
+
+ if (c < 0 || c >= BitStateCount) {
+ pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
+ __func__, start, c, action);
+ state = BitNeedSync;
+ goto write_bitmap;
+ }
+
+ if (c == BitNeedSync)
+ need_resync = !mddev->degraded;
+
+ state = state_machine[c][action];
+
+write_bitmap:
+ if (unlikely(mddev->degraded)) {
+ /* For degraded array, mark new data as need sync. */
+ if (state == BitDirty &&
+ action == BitmapActionStartwrite)
+ state = BitNeedSync;
+ /*
+ * For degraded array, resync dirty data as well, noted
+ * if array is still degraded after resync is done, all
+ * new data will still be dirty until array is clean.
+ */
+ else if (c == BitDirty &&
+ action == BitmapActionStartsync)
+ state = BitSyncing;
+ } else if (c == BitUnwritten && state == BitDirty &&
+ action == BitmapActionStartwrite && level_456) {
+ /* Delay raid456 initial recovery to first write. */
+ state = BitNeedSync;
+ }
+
+ if (state == BitNone) {
+ start++;
+ continue;
+ }
+
+ llbitmap_write(llbitmap, state, start);
+
+ if (state == BitNeedSync)
+ need_resync = !mddev->degraded;
+ else if (state == BitDirty &&
+ !timer_pending(&llbitmap->pending_timer))
+ mod_timer(&llbitmap->pending_timer,
+ jiffies + mddev->bitmap_info.daemon_sleep * HZ);
+
+ start++;
+ }
+
+ if (need_resync && level_456)
+ need_recovery = true;
+
+ if (need_recovery) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ } else if (need_resync) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+
+ return state;
+}
+
+static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+retry:
+ if (likely(percpu_ref_tryget_live(&pctl->active))) {
+ WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
+ return;
+ }
+
+ wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
+ goto retry;
+}
+
+static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+ percpu_ref_put(&pctl->active);
+}
+
+static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+ percpu_ref_kill(&pctl->active);
+
+ if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
+ llbitmap->mddev->bitmap_info.daemon_sleep * HZ))
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+ pctl->expire = LONG_MAX;
+ percpu_ref_resurrect(&pctl->active);
+ wake_up(&pctl->wait);
+}
+
+static int llbitmap_check_support(struct mddev *mddev)
+{
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
+ mdname(mddev));
+ return -EBUSY;
+ }
+
+ if (mddev->bitmap_info.space == 0) {
+ if (mddev->bitmap_info.default_space == 0) {
+ pr_notice("md/llbitmap: %s: no space for bitmap\n",
+ mdname(mddev));
+ return -ENOSPC;
+ }
+ }
+
+ if (!mddev->persistent) {
+ pr_notice("md/llbitmap: %s: array must be persistent\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ if (mddev->bitmap_info.file) {
+ pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ if (mddev->bitmap_info.external) {
+ pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ if (mddev_is_dm(mddev)) {
+ pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int llbitmap_init(struct llbitmap *llbitmap)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ sector_t blocks = mddev->resync_max_sectors;
+ unsigned long chunksize = MIN_CHUNK_SIZE;
+ unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
+ unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
+ int ret;
+
+ while (chunks > space) {
+ chunksize = chunksize << 1;
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+ }
+
+ llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
+ llbitmap->chunkshift = ffz(~chunksize);
+ llbitmap->chunksize = chunksize;
+ llbitmap->chunks = chunks;
+ mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;
+
+ ret = llbitmap_cache_pages(llbitmap);
+ if (ret)
+ return ret;
+
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+ BitmapActionInit);
+ /* flush initial llbitmap to disk */
+ __llbitmap_flush(mddev);
+
+ return 0;
+}
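
Note on the chunkshift computation above: for a power-of-two chunksize, ffz(~chunksize) is the index of the lowest set bit, i.e. log2(chunksize), so "offset >> chunkshift" gives the chunk (bit) index. A small user-space check, using __builtin_ctzl() as a stand-in for the kernel's ffz(~x):

#include <assert.h>

int main(void)
{
	unsigned long chunksize = 128;				/* sectors per chunk (64 KiB) */
	unsigned long chunkshift = __builtin_ctzl(chunksize);	/* == ffz(~chunksize) == 7 */
	unsigned long long offset = 1000;			/* some sector in the array */

	assert(chunkshift == 7);
	assert((offset >> chunkshift) == offset / chunksize);	/* chunk index 7 */
	return 0;
}
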
+
+static int llbitmap_read_sb(struct llbitmap *llbitmap)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ unsigned long daemon_sleep;
+ unsigned long chunksize;
+ unsigned long events;
+ struct page *sb_page;
+ bitmap_super_t *sb;
+ int ret = -EINVAL;
+
+ if (!mddev->bitmap_info.offset) {
+ pr_err("md/llbitmap: %s: no super block found", mdname(mddev));
+ return -EINVAL;
+ }
+
+ sb_page = llbitmap_read_page(llbitmap, 0);
+ if (IS_ERR(sb_page)) {
+ pr_err("md/llbitmap: %s: read super block failed",
+ mdname(mddev));
+ return -EIO;
+ }
+
+ sb = kmap_local_page(sb_page);
+ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
+ pr_err("md/llbitmap: %s: invalid super block magic number",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
+ pr_err("md/llbitmap: %s: invalid super block version",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (memcmp(sb->uuid, mddev->uuid, 16)) {
+ pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (mddev->bitmap_info.space == 0) {
+ int room = le32_to_cpu(sb->sectors_reserved);
+
+ if (room)
+ mddev->bitmap_info.space = room;
+ else
+ mddev->bitmap_info.space = mddev->bitmap_info.default_space;
+ }
+ llbitmap->flags = le32_to_cpu(sb->state);
+ if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
+ ret = llbitmap_init(llbitmap);
+ goto out_put_page;
+ }
+
+ chunksize = le32_to_cpu(sb->chunksize);
+ if (!is_power_of_2(chunksize)) {
+ pr_err("md/llbitmap: %s: chunksize not a power of 2",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
+ mddev->bitmap_info.space << SECTOR_SHIFT)) {
+ pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu",
+ mdname(mddev), chunksize, mddev->resync_max_sectors,
+ mddev->bitmap_info.space);
+ goto out_put_page;
+ }
+
+ daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+ if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
+ pr_err("md/llbitmap: %s: daemon sleep %lu period out of range",
+ mdname(mddev), daemon_sleep);
+ goto out_put_page;
+ }
+
+ events = le64_to_cpu(sb->events);
+ if (events < mddev->events) {
+ pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery",
+ mdname(mddev), events, mddev->events);
+ set_bit(BITMAP_STALE, &llbitmap->flags);
+ }
+
+ sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
+ mddev->bitmap_info.chunksize = chunksize;
+ mddev->bitmap_info.daemon_sleep = daemon_sleep;
+
+ llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
+ llbitmap->chunksize = chunksize;
+ llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
+ llbitmap->chunkshift = ffz(~chunksize);
+ ret = llbitmap_cache_pages(llbitmap);
+
+out_put_page:
+	kunmap_local(sb);
+	__free_page(sb_page);
+ return ret;
+}
+
+static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
+{
+ struct llbitmap *llbitmap =
+ container_of(pending_timer, struct llbitmap, pending_timer);
+
+ if (work_busy(&llbitmap->daemon_work)) {
+ pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
+ mdname(llbitmap->mddev),
+ llbitmap->mddev->bitmap_info.daemon_sleep);
+ set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
+ return;
+ }
+
+ queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+}
+
+static void md_llbitmap_daemon_fn(struct work_struct *work)
+{
+ struct llbitmap *llbitmap =
+ container_of(work, struct llbitmap, daemon_work);
+ unsigned long start;
+ unsigned long end;
+ bool restart;
+ int idx;
+
+ if (llbitmap->mddev->degraded)
+ return;
+retry:
+ start = 0;
+ end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
+ restart = false;
+
+ for (idx = 0; idx < llbitmap->nr_pages; idx++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+
+ if (idx > 0) {
+ start = end + 1;
+ end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
+ }
+
+ if (!test_bit(LLPageFlush, &pctl->flags) &&
+ time_before(jiffies, pctl->expire)) {
+ restart = true;
+ continue;
+ }
+
+ if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
+ pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
+ mdname(llbitmap->mddev), __func__, idx);
+ continue;
+ }
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
+ llbitmap_resume(llbitmap, idx);
+ }
+
+ /*
+ * If the daemon took a long time to finish, retry to prevent missing
+ * clearing dirty bits.
+ */
+ if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
+ goto retry;
+
+ /* If some page is dirty but not expired, setup timer again */
+ if (restart)
+ mod_timer(&llbitmap->pending_timer,
+ jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
+}
+
+static int llbitmap_create(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap;
+ int ret;
+
+ ret = llbitmap_check_support(mddev);
+ if (ret)
+ return ret;
+
+ llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL);
+ if (!llbitmap)
+ return -ENOMEM;
+
+ llbitmap->mddev = mddev;
+ llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
+ llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;
+
+ timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
+ INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
+ atomic_set(&llbitmap->behind_writes, 0);
+ init_waitqueue_head(&llbitmap->behind_wait);
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ mddev->bitmap = llbitmap;
+ ret = llbitmap_read_sb(llbitmap);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ if (ret) {
+ kfree(llbitmap);
+ mddev->bitmap = NULL;
+ }
+
+ return ret;
+}
+
+static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long chunks;
+
+ if (chunksize == 0)
+ chunksize = llbitmap->chunksize;
+
+ /* If there is enough space, leave the chunksize unchanged. */
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+ while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
+ chunksize = chunksize << 1;
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+ }
+
+ llbitmap->chunkshift = ffz(~chunksize);
+ llbitmap->chunksize = chunksize;
+ llbitmap->chunks = chunks;
+
+ return 0;
+}
+
+static int llbitmap_load(struct mddev *mddev)
+{
+ enum llbitmap_action action = BitmapActionReload;
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
+ action = BitmapActionStale;
+
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
+ return 0;
+}
+
+static void llbitmap_destroy(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (!llbitmap)
+ return;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+
+ timer_delete_sync(&llbitmap->pending_timer);
+ flush_workqueue(md_llbitmap_io_wq);
+ flush_workqueue(md_llbitmap_unplug_wq);
+
+ mddev->bitmap = NULL;
+ llbitmap_free_pages(llbitmap);
+ kfree(llbitmap);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+}
+
+static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = offset >> llbitmap->chunkshift;
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
+
+ while (page_start <= page_end) {
+ llbitmap_raise_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
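
A worked example of the bit/page range math above (illustrative values only, assuming 4 KiB pages so PAGE_SHIFT is 12): a 1 MiB write at sector 500000 with 64 KiB chunks touches bits 3906..3922, and once the 1 KiB superblock offset is added they all live in bitmap page 1:

#include <stdio.h>

int main(void)
{
	unsigned long chunkshift = 7;		/* 64 KiB chunks, in sectors */
	unsigned long data_offset = 1024;	/* BITMAP_DATA_OFFSET */
	unsigned long long offset = 500000, sectors = 2048;

	unsigned long start = offset >> chunkshift;			/* 3906 */
	unsigned long end = (offset + sectors - 1) >> chunkshift;	/* 3922 */
	int page_start = (start + data_offset) >> 12;			/* 1 */
	int page_end = (end + data_offset) >> 12;			/* 1 */

	printf("bits %lu..%lu, pages %d..%d\n", start, end, page_start, page_end);
	return 0;
}
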
+
+static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = offset >> llbitmap->chunkshift;
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ while (page_start <= page_end) {
+ llbitmap_release_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
+
+ while (page_start <= page_end) {
+ llbitmap_raise_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ while (page_start <= page_end) {
+ llbitmap_release_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_unplug_fn(struct work_struct *work)
+{
+ struct llbitmap_unplug_work *unplug_work =
+ container_of(work, struct llbitmap_unplug_work, work);
+ struct llbitmap *llbitmap = unplug_work->llbitmap;
+ struct blk_plug plug;
+ int i;
+
+ blk_start_plug(&plug);
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
+ !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
+ continue;
+
+ llbitmap_write_page(llbitmap, i);
+ }
+
+ blk_finish_plug(&plug);
+ md_super_wait(llbitmap->mddev);
+ complete(unplug_work->done);
+}
+
+static bool llbitmap_dirty(struct llbitmap *llbitmap)
+{
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++)
+ if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
+ return true;
+
+ return false;
+}
+
+static void llbitmap_unplug(struct mddev *mddev, bool sync)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct llbitmap *llbitmap = mddev->bitmap;
+ struct llbitmap_unplug_work unplug_work = {
+ .llbitmap = llbitmap,
+ .done = &done,
+ };
+
+ if (!llbitmap_dirty(llbitmap))
+ return;
+
+ /*
+ * Issue new bitmap IO under submit_bio() context will deadlock:
+ * - the bio will wait for bitmap bio to be done, before it can be
+ * issued;
+ * - bitmap bio will be added to current->bio_list and wait for this
+ * bio to be issued;
+ */
+ INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
+ queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&unplug_work.work);
+}
+
+/*
+ * Force to write all bitmap pages to disk, called when stopping the array, or
+ * every daemon_sleep seconds when sync_thread is running.
+ */
+static void __llbitmap_flush(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ struct blk_plug plug;
+ int i;
+
+ blk_start_plug(&plug);
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ /* mark all blocks as dirty */
+ set_bit(LLPageDirty, &pctl->flags);
+ bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
+ llbitmap_write_page(llbitmap, i);
+ }
+ blk_finish_plug(&plug);
+ md_super_wait(llbitmap->mddev);
+}
+
+static void llbitmap_flush(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++)
+ set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);
+
+ timer_delete_sync(&llbitmap->pending_timer);
+ queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+ flush_work(&llbitmap->daemon_work);
+
+ __llbitmap_flush(mddev);
+}
+
+/* This is used for raid5 lazy initial recovery */
+static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+ enum llbitmap_state c = llbitmap_read(llbitmap, p);
+
+ return c == BitClean || c == BitDirty;
+}
+
+static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+ int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+ enum llbitmap_state c = llbitmap_read(llbitmap, p);
+
+ /* always skip unwritten blocks */
+ if (c == BitUnwritten)
+ return blocks;
+
+ /* For degraded array, don't skip */
+ if (mddev->degraded)
+ return 0;
+
+ /* For resync also skip clean/dirty blocks */
+ if ((c == BitClean || c == BitDirty) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ return blocks;
+
+ return 0;
+}
+
+static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+
+ /*
+	 * Handle one bit at a time; this is much simpler, and it doesn't matter
+	 * if md_do_sync() loops a few more times.
+ */
+ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+ return llbitmap_state_machine(llbitmap, p, p,
+ BitmapActionStartsync) == BitSyncing;
+}
+
+/* Something is wrong, sync_thread stopped at @offset */
+static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+
+ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+ llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
+ BitmapActionAbortsync);
+}
+
+/* A full sync_thread is finished */
+static void llbitmap_close_sync(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ /* let daemon_fn clear dirty bits immediately */
+ WRITE_ONCE(pctl->expire, jiffies);
+ }
+
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+ BitmapActionEndsync);
+}
+
+/*
+ * sync_thread has reached @sector; update metadata every daemon_sleep seconds,
+ * just in case sync_thread has to restart after a power failure.
+ */
+static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
+ bool force)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (sector == 0) {
+ llbitmap->last_end_sync = jiffies;
+ return;
+ }
+
+ if (time_before(jiffies, llbitmap->last_end_sync +
+ HZ * mddev->bitmap_info.daemon_sleep))
+ return;
+
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ mddev->curr_resync_completed = sector;
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
+ BitmapActionEndsync);
+ __llbitmap_flush(mddev);
+
+ llbitmap->last_end_sync = jiffies;
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
+}
+
+static bool llbitmap_enabled(void *data, bool flush)
+{
+ struct llbitmap *llbitmap = data;
+
+ return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
+}
+
+static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
+ unsigned long e)
+{
+ llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
+}
+
+static void llbitmap_write_sb(struct llbitmap *llbitmap)
+{
+ int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);
+
+ bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
+ llbitmap_write_page(llbitmap, 0);
+ md_super_wait(llbitmap->mddev);
+}
+
+static void llbitmap_update_sb(void *data)
+{
+ struct llbitmap *llbitmap = data;
+ struct mddev *mddev = llbitmap->mddev;
+ struct page *sb_page;
+ bitmap_super_t *sb;
+
+ if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
+ return;
+
+ sb_page = llbitmap_read_page(llbitmap, 0);
+ if (IS_ERR(sb_page)) {
+ pr_err("%s: %s: read super block failed", __func__,
+ mdname(mddev));
+ set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
+ return;
+ }
+
+ if (mddev->events < llbitmap->events_cleared)
+ llbitmap->events_cleared = mddev->events;
+
+ sb = kmap_local_page(sb_page);
+ sb->events = cpu_to_le64(mddev->events);
+ sb->state = cpu_to_le32(llbitmap->flags);
+ sb->chunksize = cpu_to_le32(llbitmap->chunksize);
+ sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
+ sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
+ sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
+ sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);
+
+ kunmap_local(sb);
+ llbitmap_write_sb(llbitmap);
+}
+
+static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
+{
+ struct llbitmap *llbitmap = data;
+
+ memset(stats, 0, sizeof(*stats));
+
+ stats->missing_pages = 0;
+ stats->pages = llbitmap->nr_pages;
+ stats->file_pages = llbitmap->nr_pages;
+
+ stats->behind_writes = atomic_read(&llbitmap->behind_writes);
+ stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
+ stats->events_cleared = llbitmap->events_cleared;
+
+ return 0;
+}
+
+/* just flag all pages as needing to be written */
+static void llbitmap_write_all(struct mddev *mddev)
+{
+ int i;
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ set_bit(LLPageDirty, &pctl->flags);
+ bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
+ }
+}
+
+static void llbitmap_start_behind_write(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ atomic_inc(&llbitmap->behind_writes);
+}
+
+static void llbitmap_end_behind_write(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (atomic_dec_and_test(&llbitmap->behind_writes))
+ wake_up(&llbitmap->behind_wait);
+}
+
+static void llbitmap_wait_behind_writes(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (!llbitmap)
+ return;
+
+ wait_event(llbitmap->behind_wait,
+ atomic_read(&llbitmap->behind_writes) == 0);
+}
+
+static ssize_t bits_show(struct mddev *mddev, char *page)
+{
+ struct llbitmap *llbitmap;
+ int bits[BitStateCount] = {0};
+ loff_t start = 0;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ llbitmap = mddev->bitmap;
+ if (!llbitmap || !llbitmap->pctl) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "no bitmap\n");
+ }
+
+ if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "bitmap io error\n");
+ }
+
+ while (start < llbitmap->chunks) {
+ enum llbitmap_state c = llbitmap_read(llbitmap, start);
+
+ if (c < 0 || c >= BitStateCount)
+ pr_err("%s: invalid bit %llu state %d\n",
+ __func__, start, c);
+ else
+ bits[c]++;
+ start++;
+ }
+
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
+ bits[BitUnwritten], bits[BitClean], bits[BitDirty],
+ bits[BitNeedSync], bits[BitSyncing]);
+}
+
+static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
+
+static ssize_t metadata_show(struct mddev *mddev, char *page)
+{
+ struct llbitmap *llbitmap;
+ ssize_t ret;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ llbitmap = mddev->bitmap;
+ if (!llbitmap) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "no bitmap\n");
+ }
+
+ ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
+ llbitmap->chunksize, llbitmap->chunkshift,
+ llbitmap->chunks, mddev->bitmap_info.offset,
+ llbitmap->mddev->bitmap_info.daemon_sleep);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+
+ return ret;
+}
+
+static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);
+
+static ssize_t
+daemon_sleep_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
+}
+
+static ssize_t
+daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned long timeout;
+ int rv = kstrtoul(buf, 10, &timeout);
+
+ if (rv)
+ return rv;
+
+ mddev->bitmap_info.daemon_sleep = timeout;
+ return len;
+}
+
+static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);
+
+static ssize_t
+barrier_idle_show(struct mddev *mddev, char *page)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ return sprintf(page, "%lu\n", llbitmap->barrier_idle);
+}
+
+static ssize_t
+barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long timeout;
+ int rv = kstrtoul(buf, 10, &timeout);
+
+ if (rv)
+ return rv;
+
+ llbitmap->barrier_idle = timeout;
+ return len;
+}
+
+static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
+
+static struct attribute *md_llbitmap_attrs[] = {
+ &llbitmap_bits.attr,
+ &llbitmap_metadata.attr,
+ &llbitmap_daemon_sleep.attr,
+ &llbitmap_barrier_idle.attr,
+ NULL
+};
+
+static struct attribute_group md_llbitmap_group = {
+ .name = "llbitmap",
+ .attrs = md_llbitmap_attrs,
+};
+
+static struct bitmap_operations llbitmap_ops = {
+ .head = {
+ .type = MD_BITMAP,
+ .id = ID_LLBITMAP,
+ .name = "llbitmap",
+ },
+
+ .enabled = llbitmap_enabled,
+ .create = llbitmap_create,
+ .resize = llbitmap_resize,
+ .load = llbitmap_load,
+ .destroy = llbitmap_destroy,
+
+ .start_write = llbitmap_start_write,
+ .end_write = llbitmap_end_write,
+ .start_discard = llbitmap_start_discard,
+ .end_discard = llbitmap_end_discard,
+ .unplug = llbitmap_unplug,
+ .flush = llbitmap_flush,
+
+ .start_behind_write = llbitmap_start_behind_write,
+ .end_behind_write = llbitmap_end_behind_write,
+ .wait_behind_writes = llbitmap_wait_behind_writes,
+
+ .blocks_synced = llbitmap_blocks_synced,
+ .skip_sync_blocks = llbitmap_skip_sync_blocks,
+ .start_sync = llbitmap_start_sync,
+ .end_sync = llbitmap_end_sync,
+ .close_sync = llbitmap_close_sync,
+ .cond_end_sync = llbitmap_cond_end_sync,
+
+ .update_sb = llbitmap_update_sb,
+ .get_stats = llbitmap_get_stats,
+ .dirty_bits = llbitmap_dirty_bits,
+ .write_all = llbitmap_write_all,
+
+ .group = &md_llbitmap_group,
+};
+
+int md_llbitmap_init(void)
+{
+ md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (!md_llbitmap_io_wq)
+ return -ENOMEM;
+
+ md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (!md_llbitmap_unplug_wq) {
+ destroy_workqueue(md_llbitmap_io_wq);
+ md_llbitmap_io_wq = NULL;
+ return -ENOMEM;
+ }
+
+ return register_md_submodule(&llbitmap_ops.head);
+}
+
+void md_llbitmap_exit(void)
+{
+ destroy_workqueue(md_llbitmap_io_wq);
+ md_llbitmap_io_wq = NULL;
+ destroy_workqueue(md_llbitmap_unplug_wq);
+ md_llbitmap_unplug_wq = NULL;
+ unregister_md_submodule(&llbitmap_ops.head);
+}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aebe12b0ee27..e5922a682953 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = {
[ACTION_IDLE] = "idle",
};
-/* pers_list is a list of registered personalities protected by pers_lock. */
-static LIST_HEAD(pers_list);
-static DEFINE_SPINLOCK(pers_lock);
+static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;
-const struct md_cluster_operations *md_cluster_ops;
-EXPORT_SYMBOL(md_cluster_ops);
-static struct module *md_cluster_mod;
-
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
@@ -100,13 +94,12 @@ static struct workqueue_struct *md_wq;
* workqueue whith reconfig_mutex grabbed.
*/
static struct workqueue_struct *md_misc_wq;
-struct workqueue_struct *md_bitmap_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
-static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
+static void md_wakeup_thread_directly(struct md_thread __rcu **thread);
/*
* Default number of read corrections we'll attempt on an rdev
@@ -117,32 +110,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
- * is 1000 KB/sec, so the extra system load does not show up that much.
- * Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwidth if the IO
- * subsystem is idle. There is also an 'absolute maximum' reconstruction
- * speed limit - in case reconstruction slows down your system despite
- * idle IO detection.
+ * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
+ * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
+ * does not show up that much. Increase it if you want to have more guaranteed
+ * speed. Note that the RAID driver will use the maximum bandwidth
+ * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
+ *
+ * Background sync IO speed control:
+ *
+ * - below speed min:
+ * no limit;
+ * - above speed min and below speed max:
+ * a) if mddev is idle, then no limit;
+ * b) if mddev is busy handling normal IO, then limit inflight sync IO
+ * to sync_io_depth;
+ * - above speed max:
+ * sync IO can't be issued;
*
- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
- * or /sys/block/mdX/md/sync_speed_{min,max}
+ * The following configurations can be changed via /proc/sys/dev/raid/ for the
+ * system, or via /sys/block/mdX/md/ for one array.
*/
-
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
-static inline int speed_min(struct mddev *mddev)
+static int sysctl_sync_io_depth = 32;
+
+static int speed_min(struct mddev *mddev)
{
return mddev->sync_speed_min ?
mddev->sync_speed_min : sysctl_speed_limit_min;
}
-static inline int speed_max(struct mddev *mddev)
+static int speed_max(struct mddev *mddev)
{
return mddev->sync_speed_max ?
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int sync_io_depth(struct mddev *mddev)
+{
+ return mddev->sync_io_depth ?
+ mddev->sync_io_depth : sysctl_sync_io_depth;
+}
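
As a side note, the three helpers above feed the throttling policy described in the comment block: below speed_limit_min sync IO is unrestricted, above speed_limit_max it is paused, and in between it is limited to sync_io_depth inflight requests while the array is busy. A simplified user-space sketch of that decision (not the actual md_do_sync()/is_mddev_idle() logic):

#include <stdbool.h>
#include <stdio.h>

static bool can_issue_sync_io(int speed_kbps, int min, int max, int depth,
			      bool array_idle, int inflight)
{
	if (speed_kbps < min)
		return true;			/* below speed_limit_min: no limit */
	if (speed_kbps > max)
		return false;			/* above speed_limit_max: back off */
	return array_idle || inflight < depth;	/* in between: depth-limited when busy */
}

int main(void)
{
	/* slow resync on a busy array, fast resync on a busy array, resync over the cap */
	printf("%d %d %d\n",
	       can_issue_sync_io(500, 1000, 200000, 32, false, 100),
	       can_issue_sync_io(5000, 1000, 200000, 32, false, 100),
	       can_issue_sync_io(300000, 1000, 200000, 32, true, 0));
	return 0;
}
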
+
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
@@ -294,19 +303,26 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
static struct ctl_table_header *raid_table_header;
-static struct ctl_table raid_table[] = {
+static const struct ctl_table raid_table[] = {
{
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_io_depth",
+ .data = &sysctl_sync_io_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
};
@@ -322,6 +338,8 @@ static int start_readonly;
* so all the races disappear.
*/
static bool create_on_open = true;
+static bool legacy_async_del_gendisk = true;
+static bool check_new_feature = true;
/*
* We have a system wide 'event count' that is incremented
@@ -619,9 +637,12 @@ static void __mddev_put(struct mddev *mddev)
mddev->ctime || mddev->hold_active)
return;
- /* Array is not configured at all, and not held active, so destroy it */
+ /*
+	 * If the array is freed by stopping the array, MD_DELETED is set by
+	 * do_md_stop(). MD_DELETED is still set here in case the mddev is freed
+	 * directly by closing an mddev that was created by create_on_open.
+ */
set_bit(MD_DELETED, &mddev->flags);
-
/*
* Call queue_work inside the spinlock so that flush_workqueue() after
* mddev_find will succeed in waiting for the work to be done.
@@ -629,6 +650,12 @@ static void __mddev_put(struct mddev *mddev)
queue_work(md_misc_wq, &mddev->del_work);
}
+static void mddev_put_locked(struct mddev *mddev)
+{
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
@@ -650,8 +677,66 @@ static void active_io_release(struct percpu_ref *ref)
static void no_op(struct percpu_ref *r) {}
+static bool mddev_set_bitmap_ops(struct mddev *mddev)
+{
+ struct bitmap_operations *old = mddev->bitmap_ops;
+ struct md_submodule_head *head;
+
+ if (mddev->bitmap_id == ID_BITMAP_NONE ||
+ (old && old->head.id == mddev->bitmap_id))
+ return true;
+
+ xa_lock(&md_submodule);
+ head = xa_load(&md_submodule, mddev->bitmap_id);
+
+ if (!head) {
+ pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id);
+ goto err;
+ }
+
+ if (head->type != MD_BITMAP) {
+ pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id);
+ goto err;
+ }
+
+ mddev->bitmap_ops = (void *)head;
+ xa_unlock(&md_submodule);
+
+ if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) {
+ if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group))
+ pr_warn("md: cannot register extra bitmap attributes for %s\n",
+ mdname(mddev));
+ else
+ /*
+ * Inform user with KOBJ_CHANGE about new bitmap
+ * attributes.
+ */
+ kobject_uevent(&mddev->kobj, KOBJ_CHANGE);
+ }
+ return true;
+
+err:
+ xa_unlock(&md_submodule);
+ return false;
+}
+
+static void mddev_clear_bitmap_ops(struct mddev *mddev)
+{
+ if (!mddev_is_dm(mddev) && mddev->bitmap_ops &&
+ mddev->bitmap_ops->group)
+ sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group);
+
+ mddev->bitmap_ops = NULL;
+}
+
int mddev_init(struct mddev *mddev)
{
+ int err = 0;
+
+ if (!IS_ENABLED(CONFIG_MD_BITMAP))
+ mddev->bitmap_id = ID_BITMAP_NONE;
+ else
+ mddev->bitmap_id = ID_BITMAP;
if (percpu_ref_init(&mddev->active_io, active_io_release,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
@@ -659,10 +744,23 @@ int mddev_init(struct mddev *mddev)
if (percpu_ref_init(&mddev->writes_pending, no_op,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
- percpu_ref_exit(&mddev->active_io);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto exit_active_io;
}
+ err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
+ goto exit_writes_pending;
+
+ err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
+ goto exit_bio_set;
+
+ err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+ offsetof(struct md_io_clone, bio_clone), 0);
+ if (err)
+ goto exit_sync_set;
+
/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);
@@ -686,17 +784,29 @@ int mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
- mddev_set_bitmap_ops(mddev);
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
return 0;
+
+exit_sync_set:
+ bioset_exit(&mddev->sync_set);
+exit_bio_set:
+ bioset_exit(&mddev->bio_set);
+exit_writes_pending:
+ percpu_ref_exit(&mddev->writes_pending);
+exit_active_io:
+ percpu_ref_exit(&mddev->active_io);
+ return err;
}
EXPORT_SYMBOL_GPL(mddev_init);
void mddev_destroy(struct mddev *mddev)
{
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
+ bioset_exit(&mddev->io_clone_set);
percpu_ref_exit(&mddev->active_io);
percpu_ref_exit(&mddev->writes_pending);
}
@@ -850,6 +960,22 @@ void mddev_unlock(struct mddev *mddev)
kobject_del(&rdev->kobj);
export_rdev(rdev, mddev);
}
+
+ if (!legacy_async_del_gendisk) {
+ /*
+ * Call del_gendisk after releasing reconfig_mutex to avoid a
+ * deadlock (e.g. calling del_gendisk under the lock while an
+ * access to a sysfs file waits for the lock).
+ * MD_DELETED is only used by md raid and is set in do_md_stop();
+ * dm raid only uses md_stop() to stop, so dm raid doesn't need to
+ * check MD_DELETED when taking the reconfig lock.
+ */
+ if (test_bit(MD_DELETED, &mddev->flags) &&
+ !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) {
+ kobject_del(&mddev->kobj);
+ del_gendisk(mddev->gendisk);
+ }
+ }
}
EXPORT_SYMBOL_GPL(mddev_unlock);
@@ -888,16 +1014,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
-static struct md_personality *find_pers(int level, char *clevel)
+static struct md_personality *get_pers(int level, char *clevel)
{
- struct md_personality *pers;
- list_for_each_entry(pers, &pers_list, list) {
- if (level != LEVEL_NONE && pers->level == level)
- return pers;
- if (strcmp(pers->name, clevel)==0)
- return pers;
+ struct md_personality *ret = NULL;
+ struct md_submodule_head *head;
+ unsigned long i;
+
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type != MD_PERSONALITY)
+ continue;
+ if ((level != LEVEL_NONE && head->id == level) ||
+ !strcmp(head->name, clevel)) {
+ if (try_module_get(head->owner))
+ ret = (void *)head;
+ break;
+ }
}
- return NULL;
+ xa_unlock(&md_submodule);
+
+ if (!ret) {
+ if (level != LEVEL_NONE)
+ pr_warn("md: personality for level %d is not loaded!\n",
+ level);
+ else
+ pr_warn("md: personality for level %s is not loaded!\n",
+ clevel);
+ }
+
+ return ret;
+}
+
+static void put_pers(struct md_personality *pers)
+{
+ module_put(pers->head.owner);
}
/* return the offset of the super block in 512byte sectors */
@@ -956,15 +1106,26 @@ static void super_written(struct bio *bio)
wake_up(&mddev->sb_wait);
}
-void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
- sector_t sector, int size, struct page *page)
+/**
+ * md_write_metadata - write metadata to underlying disk, including
+ * array superblock, badblocks, bitmap superblock and bitmap bits.
+ * @mddev: the array to write
+ * @rdev: the underlying disk to write
+ * @sector: the offset to @rdev
+ * @size: the length of the metadata
+ * @page: the metadata
+ * @offset: the offset to @page
+ *
+ * Write @size bytes of @page, starting at @offset, to @sector of @rdev.
+ * Increment mddev->pending_writes before returning, and decrement it on
+ * completion, waking up sb_wait. The caller must call md_super_wait() after
+ * issuing IO to all rdevs. If an error occurs, md_error() will be called and
+ * the @rdev will be kicked out of @mddev.
+ */
+void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page,
+ unsigned int offset)
{
- /* write first size bytes of page to sector of rdev
- * Increment mddev->pending_writes before returning
- * and decrement it on completion, waking up sb_wait
- * if zero is reached.
- * If an error occurred, call md_error
- */
struct bio *bio;
if (!page)
@@ -982,7 +1143,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector;
- __bio_add_page(bio, page, size, 0);
+ __bio_add_page(bio, page, size, offset);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
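/*
 * Illustrative caller pattern (a sketch, not introduced by this hunk; the
 * helper name example_rewrite_sb is hypothetical): the superblock writers
 * later in this file pair md_write_metadata() with md_super_wait() and retry
 * while the wait reports an error.
 */
static void example_rewrite_sb(struct md_rdev *rdev)
{
	do {
		md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
				  rdev->sb_size, rdev->sb_page, 0);
	} while (md_super_wait(rdev->mddev) < 0);
}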
@@ -1180,7 +1341,7 @@ int md_check_no_bitmap(struct mddev *mddev)
if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
return 0;
pr_warn("%s: bitmaps are not supported for %s\n",
- mdname(mddev), mddev->pers->name);
+ mdname(mddev), mddev->pers->head.name);
return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);
@@ -1292,6 +1453,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev)
struct md_bitmap_stats stats;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return 0;
+
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return 0;
@@ -1355,13 +1519,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN))
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
else {
if (sb->events_hi == sb->cp_events_hi &&
sb->events_lo == sb->cp_events_lo) {
- mddev->recovery_cp = sb->recovery_cp;
+ mddev->resync_offset = sb->recovery_cp;
} else
- mddev->recovery_cp = 0;
+ mddev->resync_offset = 0;
}
memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
@@ -1487,10 +1651,10 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
mddev->minor_version = sb->minor_version;
if (mddev->in_sync)
{
- sb->recovery_cp = mddev->recovery_cp;
+ sb->recovery_cp = mddev->resync_offset;
sb->cp_events_hi = (mddev->events>>32);
sb->cp_events_lo = (u32)mddev->events;
- if (mddev->recovery_cp == MaxSector)
+ if (mddev->resync_offset == MaxSector)
sb->state = (1<< MD_SB_CLEAN);
} else
sb->recovery_cp = 0;
@@ -1589,8 +1753,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
num_sectors = (sector_t)(2ULL << 32) - 2;
do {
- md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
} while (md_super_wait(rdev->mddev) < 0);
return num_sectors;
}
@@ -1688,9 +1852,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
}
if (sb->pad0 ||
sb->pad3[0] ||
- memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
- /* Some padding is non-zero, might be a new feature */
- return -EINVAL;
+ memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) {
+ pr_warn("Some padding is non-zero on %pg, might be a new feature\n",
+ rdev->bdev);
+ if (check_new_feature)
+ return -EINVAL;
+ pr_warn("check_new_feature is disabled, data corruption possible\n");
+ }
rdev->preferred_minor = 0xffff;
rdev->data_offset = le64_to_cpu(sb->data_offset);
@@ -1748,7 +1916,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
count <<= sb->bblog_shift;
if (bb + 1 == 0)
break;
- if (badblocks_set(&rdev->badblocks, sector, count, 1))
+ if (!badblocks_set(&rdev->badblocks, sector, count, 1))
return -EINVAL;
}
} else if (sb->bblog_offset != 0)
@@ -1831,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
mddev->layout = le32_to_cpu(sb->layout);
mddev->raid_disks = le32_to_cpu(sb->raid_disks);
mddev->dev_sectors = le64_to_cpu(sb->size);
+ mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
mddev->events = ev1;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.space = 0;
@@ -1841,7 +2010,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
mddev->bitmap_info.default_space = (4096-1024) >> 9;
mddev->reshape_backwards = 0;
- mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ mddev->resync_offset = le64_to_cpu(sb->resync_offset);
memcpy(mddev->uuid, sb->set_uuid, 16);
mddev->max_disks = (4096-256)/2;
@@ -2027,7 +2196,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->utime = cpu_to_le64((__u64)mddev->utime);
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
- sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ sb->resync_offset = cpu_to_le64(mddev->resync_offset);
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
sb->resync_offset = cpu_to_le64(MaxSector);
else
@@ -2040,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
sb->level = cpu_to_le32(mddev->level);
sb->layout = cpu_to_le32(mddev->layout);
+ sb->logical_block_size = cpu_to_le32(mddev->logical_block_size);
if (test_bit(FailFast, &rdev->flags))
sb->devflags |= FailFast1;
else
@@ -2238,8 +2408,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
sb->super_offset = cpu_to_le64(rdev->sb_start);
sb->sb_csum = calc_sb_1_csum(sb);
do {
- md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
} while (md_super_wait(rdev->mddev) < 0);
return num_sectors;
@@ -2249,13 +2419,15 @@ static int
super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
+ struct mddev *mddev = rdev->mddev;
+
/* All necessary checks on new >= old have been done */
if (new_offset >= rdev->data_offset)
return 1;
/* with 1.0 metadata, there is no metadata to tread on
* so we can always move back */
- if (rdev->mddev->minor_version == 0)
+ if (mddev->minor_version == 0)
return 1;
/* otherwise we must be sure not to step on
@@ -2267,8 +2439,7 @@ super_1_allow_new_offset(struct md_rdev *rdev,
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
- if (!rdev->mddev->bitmap_info.file) {
- struct mddev *mddev = rdev->mddev;
+ if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) {
struct md_bitmap_stats stats;
int err;
@@ -2359,19 +2530,6 @@ int md_integrity_register(struct mddev *mddev)
return 0; /* shouldn't register */
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
- if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
- (mddev->level != 1 && mddev->level != 10 &&
- bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
- /*
- * No need to handle the failure of bioset_integrity_create,
- * because the function is called by md_run() -> pers->run(),
- * md_run calls bioset_exit -> bioset_integrity_free in case
- * of failure case.
- */
- pr_err("md: failed to create integrity pool for %s\n",
- mdname(mddev));
- return -EINVAL;
- }
return 0;
}
EXPORT_SYMBOL(md_integrity_register);
@@ -2630,6 +2788,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
if (!md_is_rdwr(mddev)) {
if (force_change)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+ pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev));
return;
}
@@ -2639,11 +2798,11 @@ repeat:
force_change = 1;
if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
nospares = 1;
- ret = md_cluster_ops->metadata_update_start(mddev);
+ ret = mddev->cluster_ops->metadata_update_start(mddev);
/* Has someone else has updated the sb */
if (!does_sb_need_changing(mddev)) {
if (ret == 0)
- md_cluster_ops->metadata_update_cancel(mddev);
+ mddev->cluster_ops->metadata_update_cancel(mddev);
bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
BIT(MD_SB_CHANGE_DEVS) |
BIT(MD_SB_CHANGE_CLEAN));
@@ -2720,7 +2879,7 @@ repeat:
/* If this is just a dirty<->clean transition, and the array is clean
* and 'events' is odd, we can roll back to the previous clean state */
if (nospares
- && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+ && (mddev->in_sync && mddev->resync_offset == MaxSector)
&& mddev->can_decrease_events
&& mddev->events != 1) {
mddev->events--;
@@ -2753,24 +2912,24 @@ repeat:
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
- mddev->bitmap_ops->update_sb(mddev->bitmap);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
if (!test_bit(Faulty, &rdev->flags)) {
- md_super_write(mddev,rdev,
- rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
pr_debug("md: (write) %pg's sb offset: %llu\n",
rdev->bdev,
(unsigned long long)rdev->sb_start);
rdev->sb_events = mddev->events;
if (rdev->badblocks.size) {
- md_super_write(mddev, rdev,
- rdev->badblocks.sector,
- rdev->badblocks.size << 9,
- rdev->bb_page);
+ md_write_metadata(mddev, rdev,
+ rdev->badblocks.sector,
+ rdev->badblocks.size << 9,
+ rdev->bb_page, 0);
rdev->badblocks.size = 0;
}
@@ -2783,7 +2942,7 @@ rewrite:
/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
if (mddev_is_clustered(mddev) && ret == 0)
- md_cluster_ops->metadata_update_finish(mddev);
+ mddev->cluster_ops->metadata_update_finish(mddev);
if (mddev->in_sync != sync_req ||
!bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
@@ -2942,7 +3101,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
else {
err = 0;
if (mddev_is_clustered(mddev))
- err = md_cluster_ops->remove_disk(mddev, rdev);
+ err = mddev->cluster_ops->remove_disk(mddev, rdev);
if (err == 0) {
md_kick_rdev_from_array(rdev);
@@ -3052,7 +3211,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* by this node eventually
*/
if (!mddev_is_clustered(rdev->mddev) ||
- (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
+ (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
clear_bit(Faulty, &rdev->flags);
err = add_bound_rdev(rdev);
}
@@ -3860,7 +4019,7 @@ level_show(struct mddev *mddev, char *page)
spin_lock(&mddev->lock);
p = mddev->pers;
if (p)
- ret = sprintf(page, "%s\n", p->name);
+ ret = sprintf(page, "%s\n", p->head.name);
else if (mddev->clevel[0])
ret = sprintf(page, "%s\n", mddev->clevel);
else if (mddev->level != LEVEL_NONE)
@@ -3917,7 +4076,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EINVAL;
if (!mddev->pers->quiesce) {
pr_warn("md: %s: %s does not support online personality change\n",
- mdname(mddev), mddev->pers->name);
+ mdname(mddev), mddev->pers->head.name);
goto out_unlock;
}
@@ -3931,24 +4090,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (request_module("md-%s", clevel) != 0)
request_module("md-level-%s", clevel);
- spin_lock(&pers_lock);
- pers = find_pers(level, clevel);
- if (!pers || !try_module_get(pers->owner)) {
- spin_unlock(&pers_lock);
- pr_warn("md: personality %s not loaded\n", clevel);
+ pers = get_pers(level, clevel);
+ if (!pers) {
rv = -EINVAL;
goto out_unlock;
}
- spin_unlock(&pers_lock);
if (pers == mddev->pers) {
/* Nothing to do! */
- module_put(pers->owner);
+ put_pers(pers);
rv = len;
goto out_unlock;
}
if (!pers->takeover) {
- module_put(pers->owner);
+ put_pers(pers);
pr_warn("md: %s: %s does not support personality takeover\n",
mdname(mddev), clevel);
rv = -EINVAL;
@@ -3969,7 +4124,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->raid_disks -= mddev->delta_disks;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
- module_put(pers->owner);
+ put_pers(pers);
pr_warn("md: %s: %s would not accept array\n",
mdname(mddev), clevel);
rv = PTR_ERR(priv);
@@ -3984,7 +4139,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
oldpriv = mddev->private;
mddev->pers = pers;
mddev->private = priv;
- strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
mddev->level = mddev->new_level;
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
@@ -4026,7 +4181,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->to_remove = &md_redundancy_group;
}
- module_put(oldpers->owner);
+ put_pers(oldpers);
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
@@ -4057,7 +4212,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
* it must always be in_sync
*/
mddev->in_sync = 1;
- del_timer_sync(&mddev->safemode_timer);
+ timer_delete_sync(&mddev->safemode_timer);
}
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -4103,6 +4258,86 @@ static struct md_sysfs_entry md_new_level =
__ATTR(new_level, 0664, new_level_show, new_level_store);
static ssize_t
+bitmap_type_show(struct mddev *mddev, char *page)
+{
+ struct md_submodule_head *head;
+ unsigned long i;
+ ssize_t len = 0;
+
+ if (mddev->bitmap_id == ID_BITMAP_NONE)
+ len += sprintf(page + len, "[none] ");
+ else
+ len += sprintf(page + len, "none ");
+
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type != MD_BITMAP)
+ continue;
+
+ if (mddev->bitmap_id == head->id)
+ len += sprintf(page + len, "[%s] ", head->name);
+ else
+ len += sprintf(page + len, "%s ", head->name);
+ }
+ xa_unlock(&md_submodule);
+
+ len += sprintf(page + len, "\n");
+ return len;
+}
+
+static ssize_t
+bitmap_type_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ struct md_submodule_head *head;
+ int id;
+ unsigned long i;
+ int err = 0;
+
+ xa_lock(&md_submodule);
+
+ if (mddev->bitmap_ops) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (cmd_match(buf, "none")) {
+ mddev->bitmap_id = ID_BITMAP_NONE;
+ goto out;
+ }
+
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type == MD_BITMAP && cmd_match(buf, head->name)) {
+ mddev->bitmap_id = head->id;
+ goto out;
+ }
+ }
+
+ err = kstrtoint(buf, 10, &id);
+ if (err)
+ goto out;
+
+ if (id == ID_BITMAP_NONE) {
+ mddev->bitmap_id = id;
+ goto out;
+ }
+
+ head = xa_load(&md_submodule, id);
+ if (head && head->type == MD_BITMAP) {
+ mddev->bitmap_id = id;
+ goto out;
+ }
+
+ err = -ENOENT;
+
+out:
+ xa_unlock(&md_submodule);
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_bitmap_type =
+__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store);
+
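/*
 * Illustrative usage (assumed device path; bitmap names depend on which
 * MD_BITMAP submodules are registered): bitmap_type accepts "none", the name
 * of a registered bitmap submodule, or its numeric submodule id, and is
 * rejected with -EBUSY once bitmap_ops has already been bound, e.g.:
 *
 *	echo none > /sys/block/md0/md/bitmap_type
 *	echo <registered-bitmap-name> > /sys/block/md0/md/bitmap_type
 */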
+static ssize_t
layout_show(struct mddev *mddev, char *page)
{
/* just a number, not meaningful for all levels */
@@ -4260,9 +4495,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
static ssize_t
resync_start_show(struct mddev *mddev, char *page)
{
- if (mddev->recovery_cp == MaxSector)
+ if (mddev->resync_offset == MaxSector)
return sprintf(page, "none\n");
- return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset);
}
static ssize_t
@@ -4288,7 +4523,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
err = -EBUSY;
if (!err) {
- mddev->recovery_cp = n;
+ mddev->resync_offset = n;
if (mddev->pers)
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
}
@@ -4633,6 +4868,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
unsigned long chunk, end_chunk;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return len;
+
err = mddev_lock(mddev);
if (err)
return err;
@@ -4792,9 +5030,42 @@ out_unlock:
static struct md_sysfs_entry md_metadata =
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
+{
+ return rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < sectors;
+}
+
+static enum sync_action md_get_active_sync_action(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ bool is_recover = false;
+
+ if (mddev->resync_offset < MaxSector)
+ return ACTION_RESYNC;
+
+ if (mddev->reshape_position != MaxSector)
+ return ACTION_RESHAPE;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev_needs_recovery(rdev, MaxSector)) {
+ is_recover = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return is_recover ? ACTION_RECOVER : ACTION_IDLE;
+}
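/*
 * Worked example (illustrative): if a resync was interrupted, so that
 * resync_offset is still below MaxSector, while a spare is also waiting for
 * recovery, md_get_active_sync_action() reports ACTION_RESYNC first; the
 * recovery of the spare only starts once no resync or reshape is pending.
 */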
+
enum sync_action md_sync_action(struct mddev *mddev)
{
unsigned long recovery = mddev->recovery;
+ enum sync_action active_action;
/*
* frozen has the highest priority, means running sync_thread will be
@@ -4818,8 +5089,17 @@ enum sync_action md_sync_action(struct mddev *mddev)
!test_bit(MD_RECOVERY_NEEDED, &recovery))
return ACTION_IDLE;
- if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
- mddev->reshape_position != MaxSector)
+ /*
+ * Check if any sync operation (resync/recover/reshape) is
+ * currently active. This ensures that only one sync operation
+ * can run at a time. Returns the type of active operation, or
+ * ACTION_IDLE if none are active.
+ */
+ active_action = md_get_active_sync_action(mddev);
+ if (active_action != ACTION_IDLE)
+ return active_action;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return ACTION_RESHAPE;
if (test_bit(MD_RECOVERY_RECOVER, &recovery))
@@ -4893,7 +5173,7 @@ static void stop_sync_thread(struct mddev *mddev, bool locked)
* Thread might be blocked waiting for metadata update which will now
* never happen
*/
- md_wakeup_thread_directly(mddev->sync_thread);
+ md_wakeup_thread_directly(&mddev->sync_thread);
if (work_pending(&mddev->sync_work))
flush_work(&mddev->sync_work);
@@ -5084,7 +5364,7 @@ static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_min(mddev),
- mddev->sync_speed_min ? "local": "system");
+ mddev->sync_speed_min ? "local" : "system");
}
static ssize_t
@@ -5093,7 +5373,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int min;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
min = 0;
} else {
rv = kstrtouint(buf, 10, &min);
@@ -5113,7 +5393,7 @@ static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_max(mddev),
- mddev->sync_speed_max ? "local": "system");
+ mddev->sync_speed_max ? "local" : "system");
}
static ssize_t
@@ -5122,7 +5402,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int max;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
max = 0;
} else {
rv = kstrtouint(buf, 10, &max);
@@ -5139,6 +5419,35 @@ static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
static ssize_t
+sync_io_depth_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
+ mddev->sync_io_depth ? "local" : "system");
+}
+
+static ssize_t
+sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int max;
+ int rv;
+
+ if (strncmp(buf, "system", 6) == 0) {
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
+ }
+ mddev->sync_io_depth = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_io_depth =
+__ATTR_RW(sync_io_depth);
+
+static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->degraded);
@@ -5584,7 +5893,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
{
- if (mddev->pers == NULL || (mddev->pers->level != 1))
+ if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
return sprintf(page, "n/a\n");
else
return sprintf(page, "%d\n", mddev->serialize_policy);
@@ -5610,7 +5919,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_suspend_and_lock(mddev);
if (err)
return err;
- if (mddev->pers == NULL || (mddev->pers->level != 1)) {
+ if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
pr_err("md: serialize_policy is only effective for raid1\n");
err = -EINVAL;
goto unlock;
@@ -5630,10 +5939,73 @@ static struct md_sysfs_entry md_serialize_policy =
__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
serialize_policy_store);
+static int mddev_set_logical_block_size(struct mddev *mddev,
+ unsigned int lbs)
+{
+ int err = 0;
+ struct queue_limits lim;
+
+ if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) {
+ pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n",
+ mdname(mddev), lbs);
+ return -EINVAL;
+ }
+
+ lim = queue_limits_start_update(mddev->gendisk->queue);
+ lim.logical_block_size = lbs;
+ pr_info("%s: logical_block_size is changed, data may be lost\n",
+ mdname(mddev));
+ err = queue_limits_commit_update(mddev->gendisk->queue, &lim);
+ if (err)
+ return err;
+
+ mddev->logical_block_size = lbs;
+ /* New lbs will be written to superblock after array is running */
+ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+ return 0;
+}
+
+static ssize_t
+lbs_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%u\n", mddev->logical_block_size);
+}
+
+static ssize_t
+lbs_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int lbs;
+ int err = -EBUSY;
+
+ /* Only 1.x meta supports configurable LBS */
+ if (mddev->major_version == 0)
+ return -EINVAL;
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ err = kstrtouint(buf, 10, &lbs);
+ if (err < 0)
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ err = mddev_set_logical_block_size(mddev, lbs);
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_logical_block_size =
+__ATTR(logical_block_size, 0644, lbs_show, lbs_store);
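/*
 * Illustrative usage (assumed device path): logical_block_size only applies
 * to 1.x metadata, can only be raised above the current value, and must be
 * set before the personality is running, e.g.:
 *
 *	echo 4096 > /sys/block/md0/md/logical_block_size
 *
 * The new value is written to the superblock once the array runs, since
 * MD_SB_CHANGE_DEVS is set above.
 */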
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_new_level.attr,
+ &md_bitmap_type.attr,
&md_layout.attr,
&md_raid_disks.attr,
&md_uuid.attr,
@@ -5651,6 +6023,7 @@ static struct attribute *md_default_attrs[] = {
&md_consistency_policy.attr,
&md_fail_last_dev.attr,
&md_serialize_policy.attr,
+ &md_logical_block_size.attr,
NULL,
};
@@ -5664,6 +6037,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
+ &md_sync_io_depth.attr,
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
@@ -5682,7 +6056,6 @@ static const struct attribute_group md_redundancy_group = {
static const struct attribute_group *md_attr_groups[] = {
&md_default_group,
- &md_bitmap_group,
NULL,
};
@@ -5714,19 +6087,30 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
struct mddev *mddev = container_of(kobj, struct mddev, kobj);
ssize_t rv;
+ struct kernfs_node *kn = NULL;
if (!entry->store)
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+
+ if (entry->store == array_state_store && cmd_match(page, "clear"))
+ kn = sysfs_break_active_protection(kobj, attr);
+
spin_lock(&all_mddevs_lock);
if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
+ if (kn)
+ sysfs_unbreak_active_protection(kn);
return -EBUSY;
}
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
mddev_put(mddev);
+
+ if (kn)
+ sysfs_unbreak_active_protection(kn);
+
return rv;
}
@@ -5734,12 +6118,13 @@ static void md_kobj_release(struct kobject *ko)
{
struct mddev *mddev = container_of(ko, struct mddev, kobj);
- if (mddev->sysfs_state)
- sysfs_put(mddev->sysfs_state);
- if (mddev->sysfs_level)
- sysfs_put(mddev->sysfs_level);
-
- del_gendisk(mddev->gendisk);
+ if (legacy_async_del_gendisk) {
+ if (mddev->sysfs_state)
+ sysfs_put(mddev->sysfs_state);
+ if (mddev->sysfs_level)
+ sysfs_put(mddev->sysfs_level);
+ del_gendisk(mddev->gendisk);
+ }
put_disk(mddev->gendisk);
}
@@ -5769,6 +6154,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
return -EINVAL;
}
+ /*
+ * Until RAID gains folio support, the logical_block_size must not
+ * be larger than the page size.
+ */
+ if (lim->logical_block_size > PAGE_SIZE) {
+ pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ mddev->logical_block_size = lim->logical_block_size;
+
return 0;
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
@@ -5781,6 +6177,13 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
if (mddev_is_dm(mddev))
return 0;
+ if (queue_logical_block_size(rdev->bdev->bd_disk->queue) >
+ queue_logical_block_size(mddev->gendisk->queue)) {
+ pr_err("%s: incompatible logical_block_size, can not add\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
lim = queue_limits_start_update(mddev->gendisk->queue);
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
@@ -5943,6 +6346,9 @@ static int md_alloc_and_put(dev_t dev, char *name)
{
struct mddev *mddev = md_alloc(dev, name);
+ if (legacy_async_del_gendisk)
+ pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n");
+
if (IS_ERR(mddev))
return PTR_ERR(mddev);
mddev_put(mddev);
@@ -5988,7 +6394,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
static void md_safemode_timeout(struct timer_list *t)
{
- struct mddev *mddev = from_timer(mddev, t, safemode_timer);
+ struct mddev *mddev = timer_container_of(mddev, t, safemode_timer);
mddev->safemode = 1;
if (mddev->external)
@@ -5999,6 +6405,26 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
+static int md_bitmap_create(struct mddev *mddev)
+{
+ if (mddev->bitmap_id == ID_BITMAP_NONE)
+ return -EINVAL;
+
+ if (!mddev_set_bitmap_ops(mddev))
+ return -ENOENT;
+
+ return mddev->bitmap_ops->create(mddev);
+}
+
+static void md_bitmap_destroy(struct mddev *mddev)
+{
+ if (!md_bitmap_registered(mddev))
+ return;
+
+ mddev->bitmap_ops->destroy(mddev);
+ mddev_clear_bitmap_ops(mddev);
+}
+
int md_run(struct mddev *mddev)
{
int err;
@@ -6078,50 +6504,20 @@ int md_run(struct mddev *mddev)
nowait = nowait && bdev_nowait(rdev->bdev);
}
- if (!bioset_initialized(&mddev->bio_set)) {
- err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (err)
- return err;
- }
- if (!bioset_initialized(&mddev->sync_set)) {
- err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (err)
- goto exit_bio_set;
- }
-
- if (!bioset_initialized(&mddev->io_clone_set)) {
- err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
- offsetof(struct md_io_clone, bio_clone), 0);
- if (err)
- goto exit_sync_set;
- }
-
- spin_lock(&pers_lock);
- pers = find_pers(mddev->level, mddev->clevel);
- if (!pers || !try_module_get(pers->owner)) {
- spin_unlock(&pers_lock);
- if (mddev->level != LEVEL_NONE)
- pr_warn("md: personality for level %d is not loaded!\n",
- mddev->level);
- else
- pr_warn("md: personality for level %s is not loaded!\n",
- mddev->clevel);
- err = -EINVAL;
- goto abort;
- }
- spin_unlock(&pers_lock);
- if (mddev->level != pers->level) {
- mddev->level = pers->level;
- mddev->new_level = pers->level;
+ pers = get_pers(mddev->level, mddev->clevel);
+ if (!pers)
+ return -EINVAL;
+ if (mddev->level != pers->head.id) {
+ mddev->level = pers->head.id;
+ mddev->new_level = pers->head.id;
}
- strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
- module_put(pers->owner);
- err = -EINVAL;
- goto abort;
+ put_pers(pers);
+ return -EINVAL;
}
if (pers->sync_request) {
@@ -6174,7 +6570,7 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
- err = mddev->bitmap_ops->create(mddev, -1);
+ err = md_bitmap_create(mddev);
if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
@@ -6246,14 +6642,8 @@ bitmap_abort:
if (mddev->private)
pers->free(mddev, mddev->private);
mddev->private = NULL;
- module_put(pers->owner);
- mddev->bitmap_ops->destroy(mddev);
-abort:
- bioset_exit(&mddev->io_clone_set);
-exit_sync_set:
- bioset_exit(&mddev->sync_set);
-exit_bio_set:
- bioset_exit(&mddev->bio_set);
+ put_pers(pers);
+ md_bitmap_destroy(mddev);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -6267,10 +6657,12 @@ int do_md_run(struct mddev *mddev)
if (err)
goto out;
- err = mddev->bitmap_ops->load(mddev);
- if (err) {
- mddev->bitmap_ops->destroy(mddev);
- goto out;
+ if (md_bitmap_registered(mddev)) {
+ err = mddev->bitmap_ops->load(mddev);
+ if (err) {
+ md_bitmap_destroy(mddev);
+ goto out;
+ }
}
if (mddev_is_clustered(mddev))
@@ -6354,7 +6746,7 @@ static void md_clean(struct mddev *mddev)
mddev->external_size = 0;
mddev->dev_sectors = 0;
mddev->raid_disks = 0;
- mddev->recovery_cp = 0;
+ mddev->resync_offset = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector;
@@ -6362,21 +6754,29 @@ static void md_clean(struct mddev *mddev)
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
+
/*
- * Don't clear MD_CLOSING, or mddev can be opened again.
- * 'hold_active != 0' means mddev is still in the creation
- * process and will be used later.
+ * In legacy_async_del_gendisk mode, the array can be stopped in the
+ * middle of assembly and then accessed again, so MD_CLOSING needs to
+ * be cleared. Without legacy_async_del_gendisk, the array can't be
+ * opened again after it is stopped, so MD_CLOSING is not cleared.
*/
- if (mddev->hold_active)
- mddev->flags = 0;
- else
+ if (legacy_async_del_gendisk && mddev->hold_active) {
+ clear_bit(MD_CLOSING, &mddev->flags);
+ } else {
+ /* if UNTIL_STOP is set, it's cleared here */
+ mddev->hold_active = 0;
+ /* Don't clear MD_CLOSING, or mddev can be opened again. */
mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
+ }
mddev->sb_flags = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
mddev->chunk_sectors = 0;
mddev->ctime = mddev->utime = 0;
mddev->layout = 0;
+ mddev->logical_block_size = 0;
mddev->max_disks = 0;
mddev->events = 0;
mddev->can_decrease_events = 0;
@@ -6407,14 +6807,15 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- del_timer_sync(&mddev->safemode_timer);
+ timer_delete_sync(&mddev->safemode_timer);
if (mddev->pers && mddev->pers->quiesce) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- mddev->bitmap_ops->flush(mddev);
+ if (md_bitmap_enabled(mddev, true))
+ mddev->bitmap_ops->flush(mddev);
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
@@ -6441,7 +6842,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
- mddev->bitmap_ops->wait_behind_writes(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -6457,7 +6859,7 @@ static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
@@ -6465,14 +6867,8 @@ static void __md_stop(struct mddev *mddev)
if (mddev->private)
pers->free(mddev, mddev->private);
mddev->private = NULL;
- if (pers->sync_request && mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
- module_put(pers->owner);
+ put_pers(pers);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-
- bioset_exit(&mddev->bio_set);
- bioset_exit(&mddev->sync_set);
- bioset_exit(&mddev->io_clone_set);
}
void md_stop(struct mddev *mddev)
@@ -6563,6 +6959,10 @@ static int do_md_stop(struct mddev *mddev, int mode)
if (!md_is_rdwr(mddev))
set_disk_ro(disk, 0);
+ if (mode == 2 && mddev->pers->sync_request &&
+ mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+
__md_stop_writes(mddev);
__md_stop(mddev);
@@ -6595,10 +6995,9 @@ static int do_md_stop(struct mddev *mddev, int mode)
mddev->bitmap_info.offset = 0;
export_array(mddev);
-
md_clean(mddev);
- if (mddev->hold_active == UNTIL_STOP)
- mddev->hold_active = 0;
+ if (!legacy_async_del_gendisk)
+ set_bit(MD_DELETED, &mddev->flags);
}
md_new_event();
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -6983,7 +7382,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
set_bit(Candidate, &rdev->flags);
else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
- err = md_cluster_ops->add_new_disk(mddev, rdev);
+ err = mddev->cluster_ops->add_new_disk(mddev, rdev);
if (err) {
export_rdev(rdev, mddev);
return err;
@@ -7000,14 +7399,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
if (!err) {
- err = md_cluster_ops->new_disk_ack(mddev,
- err == 0);
+ err = mddev->cluster_ops->new_disk_ack(
+ mddev, err == 0);
if (err)
md_kick_rdev_from_array(rdev);
}
} else {
if (err)
- md_cluster_ops->add_new_disk_cancel(mddev);
+ mddev->cluster_ops->add_new_disk_cancel(mddev);
else
err = add_bound_rdev(rdev);
}
@@ -7087,10 +7486,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
goto busy;
kick_rdev:
- if (mddev_is_clustered(mddev)) {
- if (md_cluster_ops->remove_disk(mddev, rdev))
- goto busy;
- }
+ if (mddev_is_clustered(mddev) &&
+ mddev->cluster_ops->remove_disk(mddev, rdev))
+ goto busy;
md_kick_rdev_from_array(rdev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -7179,6 +7577,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
{
int err = 0;
+ if (!md_bitmap_registered(mddev))
+ return -EINVAL;
+
if (mddev->pers) {
if (!mddev->pers->quiesce || !mddev->thread)
return -EBUSY;
@@ -7235,16 +7636,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
- err = mddev->bitmap_ops->create(mddev, -1);
+ err = md_bitmap_create(mddev);
if (!err)
err = mddev->bitmap_ops->load(mddev);
if (err) {
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
fd = -1;
}
} else if (fd < 0) {
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
}
}
@@ -7309,9 +7710,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
* openned
*/
if (info->state & (1<<MD_SB_CLEAN))
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
else
- mddev->recovery_cp = 0;
+ mddev->resync_offset = 0;
mddev->persistent = ! info->not_persistent;
mddev->external = 0;
@@ -7393,7 +7794,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
rv = mddev->pers->resize(mddev, num_sectors);
if (!rv) {
if (mddev_is_clustered(mddev))
- md_cluster_ops->update_size(mddev, old_dev_sectors);
+ mddev->cluster_ops->update_size(mddev, old_dev_sectors);
else if (!mddev_is_dm(mddev))
set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors);
@@ -7441,6 +7842,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
return rv;
}
+static int get_cluster_ops(struct mddev *mddev)
+{
+ xa_lock(&md_submodule);
+ mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
+ if (mddev->cluster_ops &&
+ !try_module_get(mddev->cluster_ops->head.owner))
+ mddev->cluster_ops = NULL;
+ xa_unlock(&md_submodule);
+
+ return mddev->cluster_ops == NULL ? -ENOENT : 0;
+}
+
+static void put_cluster_ops(struct mddev *mddev)
+{
+ if (!mddev->cluster_ops)
+ return;
+
+ mddev->cluster_ops->leave(mddev);
+ module_put(mddev->cluster_ops->head.owner);
+ mddev->cluster_ops = NULL;
+}
+
/*
* update_array_info is used to change the configuration of an
* on-line array.
@@ -7529,12 +7952,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
- rv = mddev->bitmap_ops->create(mddev, -1);
+ rv = md_bitmap_create(mddev);
if (!rv)
rv = mddev->bitmap_ops->load(mddev);
if (rv)
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
} else {
struct md_bitmap_stats stats;
@@ -7549,19 +7972,18 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
if (mddev->bitmap_info.nodes) {
/* hold PW on all the bitmap lock */
- if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
+ if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
rv = -EPERM;
- md_cluster_ops->unlock_all_bitmaps(mddev);
+ mddev->cluster_ops->unlock_all_bitmaps(mddev);
goto err;
}
mddev->bitmap_info.nodes = 0;
- md_cluster_ops->leave(mddev);
- module_put(md_cluster_mod);
+ put_cluster_ops(mddev);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7598,9 +8020,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
* 4 sectors (with a BIG number of cylinders...). This drives
* dosfs just mad... ;-)
*/
-static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mddev *mddev = bdev->bd_disk->private_data;
+ struct mddev *mddev = disk->private_data;
geo->heads = 2;
geo->sectors = 4;
@@ -7842,7 +8264,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
case CLUSTERED_DISK_NACK:
if (mddev_is_clustered(mddev))
- md_cluster_ops->new_disk_ack(mddev, false);
+ mddev->cluster_ops->new_disk_ack(mddev, false);
else
err = -EINVAL;
goto unlock;
@@ -8045,22 +8467,21 @@ static int md_thread(void *arg)
return 0;
}
-static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
+static void md_wakeup_thread_directly(struct md_thread __rcu **thread)
{
struct md_thread *t;
rcu_read_lock();
- t = rcu_dereference(thread);
+ t = rcu_dereference(*thread);
if (t)
wake_up_process(t->tsk);
rcu_read_unlock();
}
-void md_wakeup_thread(struct md_thread __rcu *thread)
+void __md_wakeup_thread(struct md_thread __rcu *thread)
{
struct md_thread *t;
- rcu_read_lock();
t = rcu_dereference(thread);
if (t) {
pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
@@ -8068,9 +8489,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread)
if (wq_has_sleeper(&t->wqueue))
wake_up(&t->wqueue);
}
- rcu_read_unlock();
}
-EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(__md_wakeup_thread);
struct md_thread *md_register_thread(void (*run) (struct md_thread *),
struct mddev *mddev, const char *name)
@@ -8124,7 +8544,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
- if (mddev->pers->level == 0)
+ if (mddev->pers->head.id == ID_RAID0 ||
+ mddev->pers->head.id == ID_LINEAR)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
@@ -8162,14 +8583,17 @@ static void status_unused(struct seq_file *seq)
static void status_personalities(struct seq_file *seq)
{
- struct md_personality *pers;
+ struct md_submodule_head *head;
+ unsigned long i;
seq_puts(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
- spin_unlock(&pers_lock);
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head)
+ if (head->type == MD_PERSONALITY)
+ seq_printf(seq, "[%s] ", head->name);
+ xa_unlock(&md_submodule);
+
seq_puts(seq, "\n");
}
@@ -8225,7 +8649,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, "\tresync=REMOTE");
return 1;
}
- if (mddev->recovery_cp < MaxSector) {
+ if (mddev->resync_offset < MaxSector) {
seq_printf(seq, "\tresync=PENDING");
return 1;
}
@@ -8338,6 +8762,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
unsigned long chunk_kb;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return;
+
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return;
@@ -8376,6 +8803,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent the bitmap from being freed while it is being checked */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : ", mdname(mddev));
@@ -8388,7 +8819,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, " (read-only)");
if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
- seq_printf(seq, " %s", mddev->pers->name);
+ seq_printf(seq, " %s", mddev->pers->head.name);
} else {
seq_printf(seq, "inactive");
}
@@ -8451,14 +8882,13 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
status_unused(seq);
- if (atomic_dec_and_test(&mddev->active))
- __mddev_put(mddev);
-
+ mddev_put_locked(mddev);
return 0;
}
@@ -8509,67 +8939,34 @@ static const struct proc_ops mdstat_proc_ops = {
.proc_poll = mdstat_poll,
};
-int register_md_personality(struct md_personality *p)
-{
- pr_debug("md: %s personality registered for level %d\n",
- p->name, p->level);
- spin_lock(&pers_lock);
- list_add_tail(&p->list, &pers_list);
- spin_unlock(&pers_lock);
- return 0;
-}
-EXPORT_SYMBOL(register_md_personality);
-
-int unregister_md_personality(struct md_personality *p)
-{
- pr_debug("md: %s personality unregistered\n", p->name);
- spin_lock(&pers_lock);
- list_del_init(&p->list);
- spin_unlock(&pers_lock);
- return 0;
-}
-EXPORT_SYMBOL(unregister_md_personality);
-
-int register_md_cluster_operations(const struct md_cluster_operations *ops,
- struct module *module)
+int register_md_submodule(struct md_submodule_head *msh)
{
- int ret = 0;
- spin_lock(&pers_lock);
- if (md_cluster_ops != NULL)
- ret = -EALREADY;
- else {
- md_cluster_ops = ops;
- md_cluster_mod = module;
- }
- spin_unlock(&pers_lock);
- return ret;
+ return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
}
-EXPORT_SYMBOL(register_md_cluster_operations);
+EXPORT_SYMBOL_GPL(register_md_submodule);
-int unregister_md_cluster_operations(void)
+void unregister_md_submodule(struct md_submodule_head *msh)
{
- spin_lock(&pers_lock);
- md_cluster_ops = NULL;
- spin_unlock(&pers_lock);
- return 0;
+ xa_erase(&md_submodule, msh->id);
}
-EXPORT_SYMBOL(unregister_md_cluster_operations);
+EXPORT_SYMBOL_GPL(unregister_md_submodule);
int md_setup_cluster(struct mddev *mddev, int nodes)
{
- int ret;
- if (!md_cluster_ops)
+ int ret = get_cluster_ops(mddev);
+
+ if (ret) {
request_module("md-cluster");
- spin_lock(&pers_lock);
+ ret = get_cluster_ops(mddev);
+ }
+
/* ensure module won't be unloaded */
- if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+ if (ret) {
pr_warn("can't find md-cluster module or get its reference.\n");
- spin_unlock(&pers_lock);
- return -ENOENT;
+ return ret;
}
- spin_unlock(&pers_lock);
- ret = md_cluster_ops->join(mddev, nodes);
+ ret = mddev->cluster_ops->join(mddev, nodes);
if (!ret)
mddev->safemode_delay = 0;
return ret;
@@ -8577,56 +8974,58 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
void md_cluster_stop(struct mddev *mddev)
{
- if (!md_cluster_ops)
- return;
- md_cluster_ops->leave(mddev);
- module_put(md_cluster_mod);
+ put_cluster_ops(mddev);
}
-static int is_mddev_idle(struct mddev *mddev, int init)
+static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
+ unsigned long last_events = rdev->last_events;
+
+ if (!bdev_is_partition(rdev->bdev))
+ return true;
+
+ /*
+ * If rdev is a partition, the array is still considered busy when the
+ * user issues IO to other partitions of the same disk, even if no IO
+ * is issued to the array itself.
+ */
+ rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
+ sectors) -
+ part_stat_read_accum(rdev->bdev, sectors);
+
+ return init || rdev->last_events <= last_events;
+}
+
+/*
+ * mddev is idle if all of the following conditions hold since the last check:
+ * 1) no normal IO has completed on the mddev;
+ * 2) the mddev has no inflight normal IO;
+ * 3) if any member disk is a partition, no IO has completed on the other
+ *    partitions of that disk.
+ *
+ * Note that this check relies on IO accounting being enabled.
+ */
+static bool is_mddev_idle(struct mddev *mddev, int init)
+{
+ unsigned long last_events = mddev->normal_io_events;
+ struct gendisk *disk;
struct md_rdev *rdev;
- int idle;
- int curr_events;
+ bool idle = true;
- idle = 1;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_disk;
+ disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
+ if (!disk)
+ return true;
- if (!init && !blk_queue_io_stat(disk->queue))
- continue;
+ mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
+ if (!init && (mddev->normal_io_events > last_events ||
+ bdev_count_inflight(disk->part0)))
+ idle = false;
- curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
- atomic_read(&disk->sync_io);
- /* sync IO will cause sync_io to increase before the disk_stats
- * as sync_io is counted when a request starts, and
- * disk_stats is counted when it completes.
- * So resync activity will cause curr_events to be smaller than
- * when there was no such activity.
- * non-sync IO will cause disk_stat to increase without
- * increasing sync_io so curr_events will (eventually)
- * be larger than it was before. Once it becomes
- * substantially larger, the test below will cause
- * the array to appear non-idle, and resync will slow
- * down.
- * If there is a lot of outstanding resync activity when
- * we set last_event to curr_events, then all that activity
- * completing might cause the array to appear non-idle
- * and resync will be slowed down even though there might
- * not have been non-resync activity. This will only
- * happen once though. 'last_events' will soon reflect
- * the state where there is little or no outstanding
- * resync requests, and further resync activity will
- * always make curr_events less than last_events.
- *
- */
- if (init || curr_events - rdev->last_events > 64) {
- rdev->last_events = curr_events;
- idle = 0;
- }
- }
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (!is_rdev_holder_idle(rdev, init))
+ idle = false;
rcu_read_unlock();
+
return idle;
}
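/*
 * Worked example (illustrative): suppose the previous check recorded
 * normal_io_events = N sectors on the array's gendisk. If the next check
 * reads N again and bdev_count_inflight() returns 0, the array itself is
 * idle; but if a member rdev is a partition and the rest of that disk saw
 * additional IO, is_rdev_holder_idle() observes last_events growing and
 * reports busy, so resync still backs off.
 */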
@@ -8745,12 +9144,38 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+static void md_bitmap_start(struct mddev *mddev,
+ struct md_io_clone *md_io_clone)
+{
+ md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
+ mddev->bitmap_ops->start_discard :
+ mddev->bitmap_ops->start_write;
+
+ if (mddev->pers->bitmap_sector)
+ mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
+ &md_io_clone->sectors);
+
+ fn(mddev, md_io_clone->offset, md_io_clone->sectors);
+}
+
+static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
+{
+ md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
+ mddev->bitmap_ops->end_discard :
+ mddev->bitmap_ops->end_write;
+
+ fn(mddev, md_io_clone->offset, md_io_clone->sectors);
+}
+
static void md_end_clone_io(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -8775,6 +9200,13 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
+ if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) {
+ md_io_clone->offset = (*bio)->bi_iter.bi_sector;
+ md_io_clone->sectors = bio_sectors(*bio);
+ md_io_clone->rw = op_stat_group(bio_op(*bio));
+ md_bitmap_start(mddev, md_io_clone);
+ }
+
clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
*bio = clone;
@@ -8793,6 +9225,9 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -8856,6 +9291,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev,
}
}
+/*
+ * If lazy recovery is requested and all rdevs are in sync, select the rdev with
+ * the higest index to perfore recovery to build initial xor data, this is the
+ * same as old bitmap.
+ */
+static bool mddev_select_lazy_recover_rdev(struct mddev *mddev)
+{
+ struct md_rdev *recover_rdev = NULL;
+ struct md_rdev *rdev;
+ bool ret = false;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->raid_disk < 0)
+ continue;
+
+ if (test_bit(Faulty, &rdev->flags) ||
+ !test_bit(In_sync, &rdev->flags))
+ break;
+
+ if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk)
+ recover_rdev = rdev;
+ }
+
+ if (recover_rdev) {
+ clear_bit(In_sync, &recover_rdev->flags);
+ ret = true;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
{
sector_t start = 0;
@@ -8867,7 +9335,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
return mddev->resync_min;
case ACTION_RESYNC:
if (!mddev->bitmap)
- return mddev->recovery_cp;
+ return mddev->resync_offset;
return 0;
case ACTION_RESHAPE:
/*
@@ -8883,14 +9351,18 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
start = MaxSector;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < start)
+ if (rdev_needs_recovery(rdev, start))
start = rdev->recovery_offset;
rcu_read_unlock();
+ /*
+ * If there are no spares and a raid456 lazy initial recovery
+ * is requested, start the recovery from sector 0.
+ */
+ if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) &&
+ start == MaxSector && mddev_select_lazy_recover_rdev(mddev))
+ start = 0;
+
/* If there is a bitmap, we need to make sure all
* writes that started before we added a spare
* complete before we start doing a recovery.
@@ -8909,6 +9381,16 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
}
}
+static bool sync_io_within_limit(struct mddev *mddev)
+{
+ /*
+ * For raid456, each sync IO covers one stripe (4k); for other levels,
+ * each sync IO covers RESYNC_PAGES (64k).
+ */
+ return atomic_read(&mddev->recovery_active) <
+ (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
+}
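/*
 * Worked example (illustrative, assuming recovery_active is counted in
 * 512-byte sectors): with the default sync_io_depth of 32, raid456 may keep
 * up to 8 * 32 = 256 sectors (32 stripes of 4k) of sync IO inflight, while
 * other levels may keep up to 128 * 32 = 4096 sectors (32 requests of 64k)
 * inflight before falling back to the is_mddev_idle() check.
 */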
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -8944,7 +9426,7 @@ void md_do_sync(struct md_thread *thread)
}
if (mddev_is_clustered(mddev)) {
- ret = md_cluster_ops->resync_start(mddev);
+ ret = mddev->cluster_ops->resync_start(mddev);
if (ret)
goto skip;
@@ -8958,6 +9440,11 @@ void md_do_sync(struct md_thread *thread)
}
action = md_sync_action(mddev);
+ if (action == ACTION_FROZEN || action == ACTION_IDLE) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto skip;
+ }
+
desc = md_sync_action_name(action);
mddev->last_sync_action = action;
@@ -8971,7 +9458,7 @@ void md_do_sync(struct md_thread *thread)
*
*/
if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_start_notify(mddev);
+ mddev->cluster_ops->resync_start_notify(mddev);
do {
int mddev2_minor = -1;
mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -9088,8 +9575,8 @@ void md_do_sync(struct md_thread *thread)
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed = j;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
- j > mddev->recovery_cp)
- mddev->recovery_cp = j;
+ j > mddev->resync_offset)
+ mddev->resync_offset = j;
update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9111,6 +9598,12 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
+ if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) {
+ sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j);
+ if (sectors)
+ goto update;
+ }
+
sectors = mddev->pers->sync_request(mddev, j, max_sectors,
&skipped);
if (sectors == 0) {
@@ -9126,6 +9619,7 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
+update:
j += sectors;
if (j > max_sectors)
/* when skipping, extra large numbers can be returned. */
@@ -9177,7 +9671,8 @@ void md_do_sync(struct md_thread *thread)
msleep(500);
goto repeat;
}
- if (!is_mddev_idle(mddev, 0)) {
+ if (!sync_io_within_limit(mddev) &&
+ !is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.
@@ -9208,19 +9703,19 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync > MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- if (mddev->curr_resync >= mddev->recovery_cp) {
+ if (mddev->curr_resync >= mddev->resync_offset) {
pr_debug("md: checkpointing %s of %s.\n",
desc, mdname(mddev));
if (test_bit(MD_RECOVERY_ERROR,
&mddev->recovery))
- mddev->recovery_cp =
+ mddev->resync_offset =
mddev->curr_resync_completed;
else
- mddev->recovery_cp =
+ mddev->resync_offset =
mddev->curr_resync;
}
} else
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
@@ -9228,12 +9723,8 @@ void md_do_sync(struct md_thread *thread)
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- mddev->delta_disks >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < mddev->curr_resync)
+ if (mddev->delta_disks >= 0 &&
+ rdev_needs_recovery(rdev, mddev->curr_resync))
rdev->recovery_offset = mddev->curr_resync;
rcu_read_unlock();
}
@@ -9324,6 +9815,12 @@ static bool rdev_is_spare(struct md_rdev *rdev)
static bool rdev_addable(struct md_rdev *rdev)
{
+ struct mddev *mddev;
+
+ mddev = READ_ONCE(rdev->mddev);
+ if (!mddev)
+ return false;
+
/* rdev is already used, don't add it again. */
if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
test_bit(Faulty, &rdev->flags))
@@ -9334,7 +9831,7 @@ static bool rdev_addable(struct md_rdev *rdev)
return true;
/* Allow to add if array is read-write. */
- if (md_is_rdwr(rdev->mddev))
+ if (md_is_rdwr(mddev))
return true;
/*
@@ -9362,17 +9859,11 @@ static bool md_spares_need_change(struct mddev *mddev)
return false;
}
-static int remove_and_add_spares(struct mddev *mddev,
- struct md_rdev *this)
+static int remove_spares(struct mddev *mddev, struct md_rdev *this)
{
struct md_rdev *rdev;
- int spares = 0;
int removed = 0;
- if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- /* Mustn't remove devices when resync thread is running */
- return 0;
-
rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
!mddev->pers->hot_remove_disk(mddev, rdev)) {
@@ -9386,6 +9877,21 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd)
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
+ return removed;
+}
+
+static int remove_and_add_spares(struct mddev *mddev,
+ struct md_rdev *this)
+{
+ struct md_rdev *rdev;
+ int spares = 0;
+ int removed = 0;
+
+ if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ /* Mustn't remove devices when resync thread is running */
+ return 0;
+
+ removed = remove_spares(mddev, this);
if (this && removed)
goto no_add;
@@ -9423,6 +9929,16 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /* Check if resync is in progress. */
+ if (mddev->resync_offset < MaxSector) {
+ remove_spares(mddev, NULL);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
return true;
}
@@ -9432,7 +9948,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
* re-add.
*/
*spares = remove_and_add_spares(mddev, NULL);
- if (*spares) {
+ if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
@@ -9442,13 +9958,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
return true;
}
- /* Check if recovery is in progress. */
- if (mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- return true;
- }
-
/* Delay to choose resync/check/repair in md_do_sync(). */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
return true;
@@ -9497,7 +10006,7 @@ static void md_start_sync(struct work_struct *ws)
* We are adding a device or devices to an array which has the bitmap
* stored on all devices. So make sure all bitmap pages get written.
*/
- if (spares)
+ if (spares && md_bitmap_enabled(mddev, true))
mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
@@ -9561,6 +10070,52 @@ static void unregister_sync_thread(struct mddev *mddev)
md_reap_sync_thread(mddev);
}
+static bool md_should_do_recovery(struct mddev *mddev)
+{
+ /*
+ * As long as one of the following flags is set, recovery work
+ * needs to be done or cleaned up.
+ */
+ if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return true;
+
+ /*
+ * If no flags are set and the array is read-only, there is
+ * nothing to do.
+ */
+ if (!md_is_rdwr(mddev))
+ return false;
+
+ /*
+ * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to
+ * active, and no action is needed for now.
+ * All other MD_SB_* flags require the superblock to be updated.
+ */
+ if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING))
+ return true;
+
+ /*
+ * If the array is not using external metadata and there has been no data
+ * written for some time, then the array's status needs to be set to
+ * in_sync.
+ */
+ if (mddev->external == 0 && mddev->safemode == 1)
+ return true;
+
+ /*
+ * When the system is about to restart or the process receives a signal,
+ * the array needs to be synchronized as soon as possible. Once the data
+ * synchronization is completed, the array status needs to be changed to
+ * in_sync.
+ */
+ if (mddev->safemode == 2 && !mddev->in_sync &&
+ mddev->resync_offset == MaxSector)
+ return true;
+
+ return false;
+}
+
/*
* This routine is regularly called by all per-raid-array threads to
* deal with generic issues like resync and super-block update.
@@ -9585,7 +10140,7 @@ static void unregister_sync_thread(struct mddev *mddev)
*/
void md_check_recovery(struct mddev *mddev)
{
- if (mddev->bitmap)
+ if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work)
mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) {
@@ -9597,18 +10152,7 @@ void md_check_recovery(struct mddev *mddev)
flush_signals(current);
}
- if (!md_is_rdwr(mddev) &&
- !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
- return;
- if ( ! (
- (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
- test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
- (mddev->external == 0 && mddev->safemode == 1) ||
- (mddev->safemode == 2
- && !mddev->in_sync && mddev->recovery_cp == MaxSector)
- ))
+ if (!md_should_do_recovery(mddev))
return;
if (mddev_trylock(mddev)) {
@@ -9652,6 +10196,7 @@ void md_check_recovery(struct mddev *mddev)
}
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
@@ -9664,8 +10209,8 @@ void md_check_recovery(struct mddev *mddev)
* remove disk.
*/
rdev_for_each_safe(rdev, tmp, mddev) {
- if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
- rdev->raid_disk < 0)
+ if (rdev->raid_disk < 0 &&
+ test_and_clear_bit(ClusterRemove, &rdev->flags))
md_kick_rdev_from_array(rdev);
}
}
@@ -9755,21 +10300,22 @@ void md_reap_sync_thread(struct mddev *mddev)
* call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
* clustered raid */
if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
- md_cluster_ops->resync_finish(mddev);
+ mddev->cluster_ops->resync_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
/*
- * We call md_cluster_ops->update_size here because sync_size could
+ * We call mddev->cluster_ops->update_size here because sync_size could
* be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
* so it is time to update size across cluster.
*/
if (mddev_is_clustered(mddev) && is_reshaped
&& !test_bit(MD_CLOSING, &mddev->flags))
- md_cluster_ops->update_size(mddev, old_dev_sectors);
+ mddev->cluster_ops->update_size(mddev, old_dev_sectors);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9807,12 +10353,11 @@ EXPORT_SYMBOL(md_finish_reshape);
/* Bad block management */
-/* Returns 1 on success, 0 on failure */
-int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new)
+/* Returns true on success, false on failure */
+bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
{
struct mddev *mddev = rdev->mddev;
- int rv;
/*
* Recording new badblocks for faulty rdev will force unnecessary
@@ -9822,50 +10367,50 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
* avoid it.
*/
if (test_bit(Faulty, &rdev->flags))
- return 1;
+ return true;
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
- if (rv == 0) {
- /* Make sure they get written out promptly */
- if (test_bit(ExternalBbl, &rdev->flags))
- sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
- sysfs_notify_dirent_safe(rdev->sysfs_state);
- set_mask_bits(&mddev->sb_flags, 0,
- BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
- md_wakeup_thread(rdev->mddev->thread);
- return 1;
- } else
- return 0;
+
+ if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
+ return false;
+
+ /* Make sure they get written out promptly */
+ if (test_bit(ExternalBbl, &rdev->flags))
+ sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ set_mask_bits(&mddev->sb_flags, 0,
+ BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
+ md_wakeup_thread(rdev->mddev->thread);
+ return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
-int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new)
+void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
{
- int rv;
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = badblocks_clear(&rdev->badblocks, s, sectors);
- if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
+
+ if (!badblocks_clear(&rdev->badblocks, s, sectors))
+ return;
+
+ if (test_bit(ExternalBbl, &rdev->flags))
sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
- return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
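After the conversion above, rdev_set_badblocks() reports success as a bool and rdev_clear_badblocks() no longer returns a status at all. A minimal caller sketch of the new contract (the raid1 hunk later in this diff performs the same check in fix_sync_read_error()); the wrapper name is hypothetical and the snippet is not compilable on its own.

/* Hypothetical wrapper, only to show the bool-returning API. */
static void demo_record_badblocks(struct mddev *mddev, struct md_rdev *rdev,
				  sector_t sect, int nr_sectors)
{
	/* If the bad range cannot be recorded, fail the whole device. */
	if (!rdev_set_badblocks(rdev, sect, nr_sectors, 0))
		md_error(mddev, rdev);
}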
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
- struct mddev *mddev, *n;
- int need_delay = 0;
+ struct mddev *mddev;
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -9876,21 +10421,11 @@ static int md_notify_reboot(struct notifier_block *this,
mddev->safemode = 2;
mddev_unlock(mddev);
}
- need_delay = 1;
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
- /*
- * certain more exotic SCSI devices are known to be
- * volatile wrt too early system reboots. While the
- * right place to handle this issue is the given
- * driver, we do want to have a safe RAID driver ...
- */
- if (need_delay)
- msleep(1000);
-
return NOTIFY_DONE;
}
@@ -9909,8 +10444,16 @@ static void md_geninit(void)
static int __init md_init(void)
{
- int ret = -ENOMEM;
+ int ret = md_bitmap_init();
+
+ if (ret)
+ return ret;
+
+ ret = md_llbitmap_init();
+ if (ret)
+ goto err_bitmap;
+ ret = -ENOMEM;
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
if (!md_wq)
goto err_wq;
@@ -9919,11 +10462,6 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;
- md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
- 0);
- if (!md_bitmap_wq)
- goto err_bitmap_wq;
-
ret = __register_blkdev(MD_MAJOR, "md", md_probe);
if (ret < 0)
goto err_md;
@@ -9942,12 +10480,13 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
- destroy_workqueue(md_bitmap_wq);
-err_bitmap_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
err_wq:
+ md_llbitmap_exit();
+err_bitmap:
+ md_bitmap_exit();
return ret;
}
@@ -9965,14 +10504,17 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
if (ret)
pr_info("md-cluster: resize failed\n");
- else
+ else if (md_bitmap_enabled(mddev, false))
mddev->bitmap_ops->update_sb(mddev->bitmap);
}
/* Check for change of roles in the active devices */
rdev_for_each_safe(rdev2, tmp, mddev) {
- if (test_bit(Faulty, &rdev2->flags))
+ if (test_bit(Faulty, &rdev2->flags)) {
+ if (test_bit(ClusterRemove, &rdev2->flags))
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
continue;
+ }
/* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
@@ -9995,7 +10537,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE) &&
- !md_cluster_ops->resync_status_get(mddev)) {
+ !mddev->cluster_ops->resync_status_get(mddev)) {
/*
* -1 to make raid1_add_disk() set conf->fullsync
* to 1. This could avoid skipping sync when the
@@ -10211,7 +10753,7 @@ void md_autostart_arrays(int part)
static __exit void md_exit(void)
{
- struct mddev *mddev, *n;
+ struct mddev *mddev;
int delay = 1;
unregister_blkdev(MD_MAJOR,"md");
@@ -10232,7 +10774,7 @@ static __exit void md_exit(void)
remove_proc_entry("mdstat", NULL);
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -10244,14 +10786,14 @@ static __exit void md_exit(void)
* the mddev for destruction by a workqueue, and the
* destroy_workqueue() below will wait for that to complete.
*/
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_misc_wq);
- destroy_workqueue(md_bitmap_wq);
destroy_workqueue(md_wq);
+ md_bitmap_exit();
}
subsys_initcall(md_init);
@@ -10270,6 +10812,8 @@ module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
+module_param(legacy_async_del_gendisk, bool, 0600);
+module_param(check_new_feature, bool, 0600);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4ba93af36126..6985f2829bbd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -18,11 +18,38 @@
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/raid/md_u.h>
#include <trace/events/block.h>
-#include "md-cluster.h"
#define MaxSector (~(sector_t)0)
+enum md_submodule_type {
+ MD_PERSONALITY = 0,
+ MD_CLUSTER,
+ MD_BITMAP,
+};
+
+enum md_submodule_id {
+ ID_LINEAR = LEVEL_LINEAR,
+ ID_RAID0 = 0,
+ ID_RAID1 = 1,
+ ID_RAID4 = 4,
+ ID_RAID5 = 5,
+ ID_RAID6 = 6,
+ ID_RAID10 = 10,
+ ID_CLUSTER,
+ ID_BITMAP,
+ ID_LLBITMAP,
+ ID_BITMAP_NONE,
+};
+
+struct md_submodule_head {
+ enum md_submodule_type type;
+ enum md_submodule_id id;
+ const char *name;
+ struct module *owner;
+};
+
/*
* These flags should really be called "NO_RETRY" rather than
* "FAILFAST" because they don't make any promise about time lapse,
@@ -106,7 +133,7 @@ struct md_rdev {
sector_t sectors; /* Device size (in 512bytes sectors) */
struct mddev *mddev; /* RAID array if running */
- int last_events; /* IO event timestamp */
+ unsigned long last_events; /* IO event timestamp */
/*
* If meta_bdev is non-NULL, it means that a separate device is
@@ -266,8 +293,8 @@ enum flag_bits {
Nonrot, /* non-rotational device (SSD) */
};
-static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors,
+ sector_t *first_bad, sector_t *bad_sectors)
{
if (unlikely(rdev->badblocks.count)) {
int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
@@ -284,16 +311,17 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
int sectors)
{
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
}
-extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new);
-extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new);
+extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
+extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
struct md_cluster_info;
+struct md_cluster_operations;
/**
* enum mddev_flags - md device flags.
@@ -326,6 +354,7 @@ enum mddev_flags {
MD_HAS_MULTIPLE_PPLS,
MD_NOT_READY,
MD_BROKEN,
+ MD_DO_DELETE,
MD_DELETED,
};
@@ -377,7 +406,8 @@ struct mddev {
* are happening, so run/
* takeover/stop are not safe
*/
- struct gendisk *gendisk;
+ struct gendisk *gendisk; /* mdraid gendisk */
+ struct gendisk *dm_gendisk; /* dm-raid gendisk */
struct kobject kobj;
int hold_active;
@@ -403,6 +433,7 @@ struct mddev {
sector_t array_sectors; /* exported array size */
int external_size; /* size managed
* externally */
+ unsigned int logical_block_size;
__u64 events;
/* If the last 'event' was simply a clean->dirty transition, and
* we didn't write it to the spares, then it is safe and simple
@@ -456,6 +487,7 @@ struct mddev {
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
+ int sync_io_depth;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
@@ -491,9 +523,10 @@ struct mddev {
* adding a spare
*/
+ unsigned long normal_io_events; /* IO event timestamp */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
- sector_t recovery_cp;
+ sector_t resync_offset;
sector_t resync_min; /* user requested sync
* starts here */
sector_t resync_max; /* resync should pause
@@ -535,6 +568,7 @@ struct mddev {
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
+ enum md_submodule_id bitmap_id;
void *bitmap; /* the bitmap for the device */
struct bitmap_operations *bitmap_ops;
struct {
@@ -576,6 +610,7 @@ struct mddev {
mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
+ struct md_cluster_operations *cluster_ops;
unsigned int good_device_nr; /* good device num within cluster raid */
unsigned int noio_flag; /* for memalloc scope API */
@@ -634,6 +669,8 @@ enum recovery_flags {
MD_RECOVERY_RESHAPE,
/* remote node is running resync thread */
MD_RESYNCING_REMOTE,
+ /* raid456 lazy initial recover */
+ MD_RECOVERY_LAZY_RECOVER,
};
enum md_ro_state {
@@ -669,11 +706,26 @@ static inline bool reshape_interrupted(struct mddev *mddev)
static inline int __must_check mddev_lock(struct mddev *mddev)
{
- return mutex_lock_interruptible(&mddev->reconfig_mutex);
+ int ret;
+
+ ret = mutex_lock_interruptible(&mddev->reconfig_mutex);
+
+ /* MD_DELETED is set in do_md_stop() with reconfig_mutex held,
+ * so check it here.
+ */
+ if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
+ ret = -ENODEV;
+ mutex_unlock(&mddev->reconfig_mutex);
+ }
+
+ return ret;
}
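With the MD_DELETED check folded into mddev_lock() above, callers can now see -ENODEV for an array that has already been stopped, in addition to -EINTR when the interruptible wait is aborted. A minimal caller sketch under that assumption; the function name is hypothetical and not part of this patch.

/* Hypothetical caller, shown only to illustrate the new error returns. */
static int demo_reconfig(struct mddev *mddev)
{
	int err = mddev_lock(mddev);	/* -EINTR or -ENODEV on failure */

	if (err)
		return err;

	/* ... work that requires reconfig_mutex ... */

	mddev_unlock(mddev);
	return 0;
}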
/* Sometimes we need to take the lock in a situation where
* failure due to interrupts is not acceptable.
+ * There is no need to check MD_DELETED here: the caller that
+ * holds the lock cannot be stopped, and no path calls this
+ * function after do_md_stop.
*/
static inline void mddev_lock_nointr(struct mddev *mddev)
{
@@ -682,27 +734,21 @@ static inline void mddev_lock_nointr(struct mddev *mddev)
static inline int mddev_trylock(struct mddev *mddev)
{
- return mutex_trylock(&mddev->reconfig_mutex);
-}
-extern void mddev_unlock(struct mddev *mddev);
-
-static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
-{
- if (blk_queue_io_stat(bdev->bd_disk->queue))
- atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
-}
+ int ret;
-static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
-{
- md_sync_acct(bio->bi_bdev, nr_sectors);
+ ret = mutex_trylock(&mddev->reconfig_mutex);
+ if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
+ ret = -ENODEV;
+ mutex_unlock(&mddev->reconfig_mutex);
+ }
+ return ret;
}
+extern void mddev_unlock(struct mddev *mddev);
struct md_personality
{
- char *name;
- int level;
- struct list_head list;
- struct module *owner;
+ struct md_submodule_head head;
+
bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
/*
* start up works that do NOT require md_thread. tasks that
@@ -746,6 +792,9 @@ struct md_personality
void *(*takeover) (struct mddev *mddev);
/* Changes the consistency policy of an active array. */
int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
+ /* convert io ranges from array to bitmap */
+ void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
+ unsigned long *sectors);
};
struct md_sysfs_entry {
@@ -753,7 +802,6 @@ struct md_sysfs_entry {
ssize_t (*show)(struct mddev *, char *);
ssize_t (*store)(struct mddev *, const char *, size_t);
};
-extern const struct attribute_group md_bitmap_group;
static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
{
@@ -828,29 +876,34 @@ struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
+ sector_t offset;
+ unsigned long sectors;
+ enum stat_group rw;
struct bio bio_clone;
};
#define THREAD_WAKEUP 0
+#define md_wakeup_thread(thread) do { \
+ rcu_read_lock(); \
+ __md_wakeup_thread(thread); \
+ rcu_read_unlock(); \
+} while (0)
+
static inline void safe_put_page(struct page *p)
{
if (p) put_page(p);
}
-extern int register_md_personality(struct md_personality *p);
-extern int unregister_md_personality(struct md_personality *p);
-extern int register_md_cluster_operations(const struct md_cluster_operations *ops,
- struct module *module);
-extern int unregister_md_cluster_operations(void);
-extern int md_setup_cluster(struct mddev *mddev, int nodes);
-extern void md_cluster_stop(struct mddev *mddev);
+int register_md_submodule(struct md_submodule_head *msh);
+void unregister_md_submodule(struct md_submodule_head *msh);
+
extern struct md_thread *md_register_thread(
void (*run)(struct md_thread *thread),
struct mddev *mddev,
const char *name);
extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
-extern void md_wakeup_thread(struct md_thread __rcu *thread);
+extern void __md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern enum sync_action md_sync_action(struct mddev *mddev);
@@ -868,8 +921,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio);
void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
-extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
- sector_t sector, int size, struct page *page);
+void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page,
+ unsigned int offset);
extern int md_super_wait(struct mddev *mddev);
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, blk_opf_t opf, bool metadata_op);
@@ -901,7 +955,6 @@ extern void md_idle_sync_thread(struct mddev *mddev);
extern void md_frozen_sync_thread(struct mddev *mddev);
extern void md_unfrozen_sync_thread(struct mddev *mddev);
-extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
extern void mddev_destroy_serial_pool(struct mddev *mddev,
@@ -923,7 +976,6 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
}
}
-extern const struct md_cluster_operations *md_cluster_ops;
static inline int mddev_is_clustered(struct mddev *mddev)
{
return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
@@ -974,7 +1026,6 @@ struct mdu_array_info_s;
struct mdu_disk_info_s;
extern int mdp_major;
-extern struct workqueue_struct *md_bitmap_wq;
void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
@@ -995,6 +1046,12 @@ static inline bool mddev_is_dm(struct mddev *mddev)
return !mddev->gendisk;
}
+static inline bool raid_is_456(struct mddev *mddev)
+{
+ return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 ||
+ mddev->level == ID_RAID6;
+}
+
static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
sector_t sector)
{
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index f4f948b0e173..dbb97a7233ab 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -2,7 +2,7 @@
config DM_PERSISTENT_DATA
tristate
depends on BLK_DEV_DM
- select LIBCRC32C
+ select CRC32
select DM_BUFIO
help
Library providing immutable on-disk data structure support for
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 157c9bd2fed7..8f8792e55806 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c)
if (c->block)
unlock_ablock(c->info, c->block);
- c->block = NULL;
- c->ab = NULL;
c->index = 0;
r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le);
if (r) {
DMERR("dm_btree_cursor_get_value failed");
- dm_btree_cursor_end(&c->cursor);
+ goto out;
} else {
r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab);
if (r) {
DMERR("get_ablock failed");
- dm_btree_cursor_end(&c->cursor);
+ goto out;
}
}
+ return 0;
+
+out:
+ dm_btree_cursor_end(&c->cursor);
+ c->block = NULL;
+ c->ab = NULL;
return r;
}
@@ -956,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin);
void dm_array_cursor_end(struct dm_array_cursor *c)
{
- if (c->block) {
+ if (c->block)
unlock_ablock(c->info, c->block);
- dm_btree_cursor_end(&c->cursor);
- }
+
+ dm_btree_cursor_end(&c->cursor);
}
EXPORT_SYMBOL_GPL(dm_array_cursor_end);
@@ -999,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
}
count -= remaining;
+ c->index += (remaining - 1);
r = dm_array_cursor_next(c);
} while (!r);
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index c7ba4e6cbbc7..98c745d90f48 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/hash.h>
+#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
@@ -77,7 +78,7 @@ static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
/*----------------------------------------------------------------*/
struct shadow_info {
- struct hlist_node hlist;
+ struct rb_node node;
dm_block_t where;
};
@@ -95,7 +96,7 @@ struct dm_transaction_manager {
struct dm_space_map *sm;
spinlock_t lock;
- struct hlist_head buckets[DM_HASH_SIZE];
+ struct rb_root buckets[DM_HASH_SIZE];
struct prefetch_set prefetches;
};
@@ -106,14 +107,22 @@ static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
{
int r = 0;
unsigned int bucket = dm_hash_block(b, DM_HASH_MASK);
- struct shadow_info *si;
+ struct rb_node **node;
spin_lock(&tm->lock);
- hlist_for_each_entry(si, tm->buckets + bucket, hlist)
- if (si->where == b) {
+ node = &tm->buckets[bucket].rb_node;
+ while (*node) {
+ struct shadow_info *si =
+ rb_entry(*node, struct shadow_info, node);
+ if (b == si->where) {
r = 1;
break;
}
+ if (b < si->where)
+ node = &si->node.rb_left;
+ else
+ node = &si->node.rb_right;
+ }
spin_unlock(&tm->lock);
return r;
@@ -130,30 +139,41 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
si = kmalloc(sizeof(*si), GFP_NOIO);
if (si) {
+ struct rb_node **node, *parent;
si->where = b;
bucket = dm_hash_block(b, DM_HASH_MASK);
+
spin_lock(&tm->lock);
- hlist_add_head(&si->hlist, tm->buckets + bucket);
+ node = &tm->buckets[bucket].rb_node;
+ parent = NULL;
+ while (*node) {
+ struct shadow_info *si =
+ rb_entry(*node, struct shadow_info, node);
+ parent = *node;
+ if (b < si->where)
+ node = &si->node.rb_left;
+ else
+ node = &si->node.rb_right;
+ }
+ rb_link_node(&si->node, parent, node);
+ rb_insert_color(&si->node, &tm->buckets[bucket]);
spin_unlock(&tm->lock);
}
}
static void wipe_shadow_table(struct dm_transaction_manager *tm)
{
- struct shadow_info *si;
- struct hlist_node *tmp;
- struct hlist_head *bucket;
- int i;
+ unsigned int i;
spin_lock(&tm->lock);
for (i = 0; i < DM_HASH_SIZE; i++) {
- bucket = tm->buckets + i;
- hlist_for_each_entry_safe(si, tmp, bucket, hlist)
+ while (!RB_EMPTY_ROOT(&tm->buckets[i])) {
+ struct shadow_info *si =
+ rb_entry(tm->buckets[i].rb_node, struct shadow_info, node);
+ rb_erase(&si->node, &tm->buckets[i]);
kfree(si);
-
- INIT_HLIST_HEAD(bucket);
+ }
}
-
spin_unlock(&tm->lock);
}
@@ -162,7 +182,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm)
static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
struct dm_space_map *sm)
{
- int i;
+ unsigned int i;
struct dm_transaction_manager *tm;
tm = kmalloc(sizeof(*tm), GFP_KERNEL);
@@ -176,7 +196,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
spin_lock_init(&tm->lock);
for (i = 0; i < DM_HASH_SIZE; i++)
- INIT_HLIST_HEAD(tm->buckets + i);
+ tm->buckets[i] = RB_ROOT;
prefetch_init(&tm->prefetches);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index baaf5f8b80ae..985c377356eb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -68,7 +68,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
struct strip_zone *zone;
int cnt;
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- unsigned blksize = 512;
+ unsigned int blksize = 512;
+
+ if (!mddev_is_dm(mddev))
+ blksize = queue_logical_block_size(mddev->gendisk->queue);
*private_conf = ERR_PTR(-ENOMEM);
if (!conf)
@@ -84,7 +87,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
- blksize = max(blksize, queue_logical_block_size(
+ if (mddev_is_dm(mddev))
+ blksize = max(blksize, queue_logical_block_size(
rdev1->bdev->bd_disk->queue));
rdev_for_each(rdev2, mddev) {
@@ -382,13 +386,15 @@ static int raid0_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_hw_sectors = mddev->chunk_sectors;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+ lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
+ lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * mddev->raid_disks;
+ lim.chunk_sectors = mddev->chunk_sectors;
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -404,6 +410,12 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
+ if (!mddev_is_dm(mddev)) {
+ ret = raid0_set_limits(mddev);
+ if (ret)
+ return ret;
+ }
+
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
@@ -412,11 +424,6 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
- if (!mddev_is_dm(mddev)) {
- ret = raid0_set_limits(mddev);
- if (ret)
- return ret;
- }
/* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
@@ -463,21 +470,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
zone = find_zone(conf, &start);
if (bio_end_sector(bio) > zone->zone_end) {
- struct bio *split = bio_split(bio,
- zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
- &mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ bio = bio_submit_split_bioset(bio,
+ zone->zone_end - bio->bi_iter.bi_sector,
+ &mddev->bio_set);
+ if (!bio)
return;
- }
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
end = zone->zone_end;
- } else
+ } else {
end = bio_end_sector(bio);
+ }
orig_end = end;
if (zone != conf->strip_zone)
@@ -612,17 +614,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
: sector_div(sector, chunk_sects));
if (sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+ bio = bio_submit_split_bioset(bio, sectors,
&mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ if (!bio)
return true;
- }
- bio_chain(split, bio);
- raid0_map_submit_bio(mddev, bio);
- bio = split;
}
raid0_map_submit_bio(mddev, bio);
@@ -674,7 +669,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
mddev->raid_disks--;
mddev->delta_disks = -1;
/* make sure it will be not marked as dirty */
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf);
@@ -717,7 +712,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
mddev->raid_disks += mddev->delta_disks;
mddev->degraded = 0;
/* make sure it will be not marked as dirty */
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf);
@@ -760,7 +755,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
mddev->delta_disks = 1 - mddev->raid_disks;
mddev->raid_disks = 1;
/* make sure it will be not marked as dirty */
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf);
@@ -810,9 +805,13 @@ static void raid0_quiesce(struct mddev *mddev, int quiesce)
static struct md_personality raid0_personality=
{
- .name = "raid0",
- .level = 0,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID0,
+ .name = "raid0",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid0_make_request,
.run = raid0_run,
.free = raid0_free,
@@ -823,14 +822,14 @@ static struct md_personality raid0_personality=
.error_handler = raid0_error,
};
-static int __init raid0_init (void)
+static int __init raid0_init(void)
{
- return register_md_personality (&raid0_personality);
+ return register_md_submodule(&raid0_personality.head);
}
-static void raid0_exit (void)
+static void __exit raid0_exit(void)
{
- unregister_md_personality (&raid0_personality);
+ unregister_md_submodule(&raid0_personality.head);
}
module_init(raid0_init);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 4378d3250bd7..521625756128 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* If bitmap is not enabled, it's safe to submit the io directly, and
* this can get optimal performance.
*/
- if (!mddev->bitmap_ops->enabled(mddev)) {
+ if (!md_bitmap_enabled(mddev, true)) {
raid1_submit_write(bio);
return true;
}
@@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev,
sector_t this_sector, int *len)
{
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
/* no bad block overlap */
if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
@@ -283,13 +283,23 @@ static inline int raid1_check_read_range(struct md_rdev *rdev,
static inline bool raid1_should_read_first(struct mddev *mddev,
sector_t this_sector, int len)
{
- if ((mddev->recovery_cp < this_sector + len))
+ if ((mddev->resync_offset < this_sector + len))
return true;
if (mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, READ, this_sector,
- this_sector + len))
+ mddev->cluster_ops->area_resyncing(mddev, READ, this_sector,
+ this_sector + len))
return true;
return false;
}
+
+/*
+ * A bio with REQ_RAHEAD or REQ_NOWAIT can fail at any time, before such IO
+ * is submitted to the underlying disks; hence don't record badblocks or
+ * retry in this case.
+ */
+static inline bool raid1_should_handle_error(struct bio *bio)
+{
+ return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT));
+}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a5adf08ee174..57d50465eed1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -36,6 +36,7 @@
#include "md.h"
#include "raid1.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
@@ -45,6 +46,7 @@
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
+static void raid1_free(struct mddev *mddev, void *priv);
#define RAID_1_10_NAME "raid1"
#include "raid1-10.c"
@@ -125,10 +127,9 @@ static inline struct r1bio *get_resync_r1bio(struct bio *bio)
return get_resync_pages(bio)->raid_bio;
}
-static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
+static void *r1bio_pool_alloc(gfp_t gfp_flags, struct r1conf *conf)
{
- struct pool_info *pi = data;
- int size = offsetof(struct r1bio, bios[pi->raid_disks]);
+ int size = offsetof(struct r1bio, bios[conf->raid_disks * 2]);
/* allocate a r1bio with room for raid_disks entries in the bios array */
return kzalloc(size, gfp_flags);
@@ -143,18 +144,18 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
- struct pool_info *pi = data;
+ struct r1conf *conf = data;
struct r1bio *r1_bio;
struct bio *bio;
int need_pages;
int j;
struct resync_pages *rps;
- r1_bio = r1bio_pool_alloc(gfp_flags, pi);
+ r1_bio = r1bio_pool_alloc(gfp_flags, conf);
if (!r1_bio)
return NULL;
- rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages),
+ rps = kmalloc_array(conf->raid_disks * 2, sizeof(struct resync_pages),
gfp_flags);
if (!rps)
goto out_free_r1bio;
@@ -162,11 +163,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
- for (j = pi->raid_disks ; j-- ; ) {
+ for (j = conf->raid_disks * 2; j-- ; ) {
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r1_bio->bios[j] = bio;
}
/*
@@ -175,11 +176,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
* If this is a user-requested check/repair, allocate
* RESYNC_PAGES for each bio.
*/
- if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
- need_pages = pi->raid_disks;
+ if (test_bit(MD_RECOVERY_REQUESTED, &conf->mddev->recovery))
+ need_pages = conf->raid_disks * 2;
else
need_pages = 1;
- for (j = 0; j < pi->raid_disks; j++) {
+ for (j = 0; j < conf->raid_disks * 2; j++) {
struct resync_pages *rp = &rps[j];
bio = r1_bio->bios[j];
@@ -205,7 +206,7 @@ out_free_pages:
resync_free_pages(&rps[j]);
out_free_bio:
- while (++j < pi->raid_disks) {
+ while (++j < conf->raid_disks * 2) {
bio_uninit(r1_bio->bios[j]);
kfree(r1_bio->bios[j]);
}
@@ -218,12 +219,12 @@ out_free_r1bio:
static void r1buf_pool_free(void *__r1_bio, void *data)
{
- struct pool_info *pi = data;
+ struct r1conf *conf = data;
int i;
struct r1bio *r1bio = __r1_bio;
struct resync_pages *rp = NULL;
- for (i = pi->raid_disks; i--; ) {
+ for (i = conf->raid_disks * 2; i--; ) {
rp = get_resync_pages(r1bio->bios[i]);
resync_free_pages(rp);
bio_uninit(r1bio->bios[i]);
@@ -253,7 +254,7 @@ static void free_r1bio(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private;
put_all_bios(conf, r1_bio);
- mempool_free(r1_bio, &conf->r1bio_pool);
+ mempool_free(r1_bio, conf->r1bio_pool);
}
static void put_buf(struct r1bio *r1_bio)
@@ -371,14 +372,16 @@ static void raid1_end_read_request(struct bio *bio)
*/
update_head_pos(r1_bio->read_disk, r1_bio);
- if (uptodate)
+ if (uptodate) {
set_bit(R1BIO_Uptodate, &r1_bio->state);
- else if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R1BIO_FailFast, &r1_bio->state))
+ } else if (test_bit(FailFast, &rdev->flags) &&
+ test_bit(R1BIO_FailFast, &r1_bio->state)) {
/* This was a fail-fast read so we definitely
* want to retry */
;
- else {
+ } else if (!raid1_should_handle_error(bio)) {
+ uptodate = 1;
+ } else {
/* If all other devices have failed, we want to return
* the error upwards rather than fail the last device.
* Here we redefine "uptodate" to mean "Don't want to retry"
@@ -420,10 +423,8 @@ static void close_write(struct r1bio *r1_bio)
r1_bio->behind_master_bio = NULL;
}
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->end_behind_write(mddev);
md_write_end(mddev);
}
@@ -451,16 +452,15 @@ static void raid1_end_write_request(struct bio *bio)
struct bio *to_put = NULL;
int mirror = find_bio_disk(r1_bio, bio);
struct md_rdev *rdev = conf->mirrors[mirror].rdev;
- bool discard_error;
sector_t lo = r1_bio->sector;
sector_t hi = r1_bio->sector + r1_bio->sectors;
-
- discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
+ bool ignore_error = !raid1_should_handle_error(bio) ||
+ (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
/*
* 'one mirror IO has finished' event handler:
*/
- if (bio->bi_status && !discard_error) {
+ if (bio->bi_status && !ignore_error) {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
@@ -480,8 +480,6 @@ static void raid1_end_write_request(struct bio *bio)
if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
else {
- /* Fail the request */
- set_bit(R1BIO_Degraded, &r1_bio->state);
/* Finished with this branch */
r1_bio->bios[mirror] = NULL;
to_put = bio;
@@ -513,7 +511,7 @@ static void raid1_end_write_request(struct bio *bio)
/* Maybe we can clear some bad blocks. */
if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
- !discard_error) {
+ !ignore_error) {
r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state);
}
@@ -1227,7 +1225,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
int i = 0;
struct bio *behind_bio = NULL;
- behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
+ behind_bio = bio_alloc_bioset(NULL, vcnt, bio->bi_opf, GFP_NOIO,
&r1_bio->mddev->bio_set);
/* discard op, we don't support writezero/writesame yet */
@@ -1306,9 +1304,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio)
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
- /* Ensure no bio records IO_BLOCKED */
- memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+ memset(r1_bio, 0, offsetof(struct r1bio, bios[conf->raid_disks * 2]));
init_r1bio(r1_bio, mddev, bio);
return r1_bio;
}
@@ -1319,10 +1316,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct bio *read_bio;
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
- int rdisk, error;
+ int rdisk;
bool r1bio_existed = !!r1_bio;
/*
@@ -1371,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r1_bio->sector,
mirror->rdev->bdev);
- if (test_bit(WriteMostly, &mirror->rdev->flags)) {
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ md_bitmap_enabled(mddev, false)) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
@@ -1381,16 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, &conf->bio_split);
-
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
+ if (!bio) {
+ set_bit(R1BIO_Returned, &r1_bio->state);
goto err_handle;
}
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
}
@@ -1402,13 +1395,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
&mddev->bio_set);
-
+ read_bio->bi_opf &= ~REQ_NOWAIT;
r1_bio->bios[rdisk] = read_bio;
read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
read_bio->bi_end_io = raid1_end_read_request;
- read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &mirror->rdev->flags) &&
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -1419,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
err_handle:
atomic_dec(&mirror->rdev->nr_pending);
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R1BIO_Uptodate, &r1_bio->state);
raid_end_bio_io(r1_bio);
}
@@ -1458,12 +1448,36 @@ retry:
return true;
}
+static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio,
+ struct bio *bio)
+{
+ unsigned long max_write_behind = mddev->bitmap_info.max_write_behind;
+ struct md_bitmap_stats stats;
+ int err;
+
+ /* behind writes rely on the bitmap, see bitmap_operations */
+ if (!md_bitmap_enabled(mddev, false))
+ return;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (err)
+ return;
+
+ /* Don't do behind IO if reader is waiting, or there are too many. */
+ if (!stats.behind_wait && stats.behind_writes < max_write_behind)
+ alloc_behind_master_bio(r1_bio, bio);
+
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->start_behind_write(mddev);
+
+}
+
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int max_write_sectors)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- int i, disks, k, error;
+ int i, disks, k;
unsigned long flags;
int first_clone;
int max_sectors;
@@ -1471,7 +1485,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
if (mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, WRITE,
+ mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))) {
DEFINE_WAIT(w);
@@ -1482,7 +1496,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE);
- if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))
break;
@@ -1535,16 +1549,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
write_behind = true;
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
- if (i < conf->raid_disks)
- set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
- }
atomic_inc(&rdev->nr_pending);
if (test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
int is_bad;
is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
@@ -1558,20 +1569,22 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
max_sectors = bad_sectors;
rdev_dec_pending(rdev, mddev);
- /* We don't set R1BIO_Degraded as that
- * only applies if the disk is
- * missing, so it might be re-added,
- * and we want to know to recover this
- * chunk.
- * In this case the device is here,
- * and the fact that this chunk is not
- * in-sync is recorded in the bad
- * block log
- */
continue;
}
if (is_bad) {
- int good_sectors = first_bad - r1_bio->sector;
+ int good_sectors;
+
+ /*
+ * We cannot atomically write this, so just
+ * error in that case. It could be possible to
+ * atomically write other mirrors, but the
+ * complexity of supporting that is not worth
+ * the benefit.
+ */
+ if (bio->bi_opf & REQ_ATOMIC)
+ goto err_handle;
+
+ good_sectors = first_bad - r1_bio->sector;
if (good_sectors < max_sectors)
max_sectors = good_sectors;
}
@@ -1589,16 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- GFP_NOIO, &conf->bio_split);
-
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
+ if (!bio) {
+ set_bit(R1BIO_Returned, &r1_bio->state);
goto err_handle;
}
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
}
@@ -1617,23 +1627,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
continue;
if (first_clone) {
- unsigned long max_write_behind =
- mddev->bitmap_info.max_write_behind;
- struct md_bitmap_stats stats;
- int err;
-
- /* do behind I/O ?
- * Not if there are too many, or cannot
- * allocate memory, or a reader on WriteMostly
- * is waiting for behind writes to flush */
- err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
- if (!err && write_behind && !stats.behind_wait &&
- stats.behind_writes < max_write_behind)
- alloc_behind_master_bio(r1_bio, bio);
-
- mddev->bitmap_ops->startwrite(
- mddev, r1_bio->sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ if (write_behind)
+ raid1_start_write_behind(mddev, r1_bio, bio);
first_clone = 0;
}
@@ -1653,11 +1648,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
wait_for_serialization(rdev, r1_bio);
}
+ mbio->bi_opf &= ~REQ_NOWAIT;
r1_bio->bios[i] = mbio;
mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
if (test_bit(FailFast, &rdev->flags) &&
!test_bit(WriteMostly, &rdev->flags) &&
conf->raid_disks - mddev->degraded > 1)
@@ -1689,8 +1684,6 @@ err_handle:
}
}
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R1BIO_Uptodate, &r1_bio->state);
raid_end_bio_io(r1_bio);
}
@@ -2063,7 +2056,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
/* make sure these bits don't get cleared. */
do {
- mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks);
+ md_bitmap_end_sync(mddev, s, &sync_blocks);
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
@@ -2206,14 +2199,9 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
if (!rdev_set_badblocks(rdev, sect, s, 0))
abort = 1;
}
- if (abort) {
- conf->recovery_disabled =
- mddev->recovery_disabled;
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_done_sync(mddev, r1_bio->sectors, 0);
- put_buf(r1_bio);
+ if (abort)
return 0;
- }
+
/* Try next page */
sectors -= s;
sect += s;
@@ -2352,10 +2340,21 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
int disks = conf->raid_disks * 2;
struct bio *wbio;
- if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
- /* ouch - failed to read all of that. */
- if (!fix_sync_read_error(r1_bio))
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /*
+ * ouch - failed to read all of that.
+ * No need to fix read error for check/repair
+ * because all member disks are read.
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
+ !fix_sync_read_error(r1_bio)) {
+ conf->recovery_disabled = mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_done_sync(mddev, r1_bio->sectors, 0);
+ put_buf(r1_bio);
return;
+ }
+ }
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
process_checks(r1_bio);
@@ -2382,7 +2381,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
wbio->bi_end_io = end_sync_write;
atomic_inc(&r1_bio->remaining);
- md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
@@ -2489,7 +2487,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
}
-static int narrow_write_error(struct r1bio *r1_bio, int i)
+static bool narrow_write_error(struct r1bio *r1_bio, int i)
{
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
@@ -2510,10 +2508,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r1_bio->sectors;
- int ok = 1;
+ bool ok = true;
if (rdev->badblocks.shift < 0)
- return 0;
+ return false;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -2599,12 +2597,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* errors.
*/
fail = true;
- if (!narrow_write_error(r1_bio, m)) {
+ if (!narrow_write_error(r1_bio, m))
md_error(conf->mddev,
conf->mirrors[m].rdev);
/* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- }
rdev_dec_pending(conf->mirrors[m].rdev,
conf->mddev);
}
@@ -2695,8 +2691,6 @@ static void raid1d(struct md_thread *thread)
list_del(&r1_bio->retry_list);
idx = sector_to_idx(r1_bio->sector);
atomic_dec(&conf->nr_queued[idx]);
- if (mddev->degraded)
- set_bit(R1BIO_Degraded, &r1_bio->state);
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
raid_end_bio_io(r1_bio);
@@ -2750,7 +2744,7 @@ static int init_resync(struct r1conf *conf)
BUG_ON(mempool_initialized(&conf->r1buf_pool));
return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
- r1buf_pool_free, conf->poolinfo);
+ r1buf_pool_free, conf);
}
static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
@@ -2760,7 +2754,7 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
struct bio *bio;
int i;
- for (i = conf->poolinfo->raid_disks; i--; ) {
+ for (i = conf->raid_disks * 2; i--; ) {
bio = r1bio->bios[i];
rps = bio->bi_private;
bio_reset(bio, NULL, 0);
@@ -2809,12 +2803,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* We can find the current address in mddev->curr_resync
*/
if (mddev->curr_resync < max_sector) /* aborted */
- mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
@@ -2825,7 +2820,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
}
if (mddev->bitmap == NULL &&
- mddev->recovery_cp == MaxSector &&
+ mddev->resync_offset == MaxSector &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
conf->fullsync == 0) {
*skipped = 1;
@@ -2834,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* before building a request, check if we can skip these blocks..
* This call to md_bitmap_start_sync doesn't actually record anything
*/
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
@@ -2851,10 +2846,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* we are incrementing sector_nr below. To be safe, we check against
* sector_nr + two times RESYNC_SECTORS
*/
-
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
- mddev_is_clustered(mddev) &&
- (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ mddev_is_clustered(mddev) &&
+ (sector_nr + 2 * RESYNC_SECTORS >
+ conf->cluster_sync_high));
if (raise_barrier(conf, sector_nr))
return 0;
@@ -2893,7 +2889,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
/* may need to read from here */
sector_t first_bad = MaxSector;
- int bad_sectors;
+ sector_t bad_sectors;
if (is_badblock(rdev, sector_nr, good_sectors,
&first_bad, &bad_sectors)) {
@@ -3009,8 +3005,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
if (sync_blocks == 0) {
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
- &sync_blocks, still_degraded) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr,
+ &sync_blocks, still_degraded) &&
!conf->fullsync &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
@@ -3045,9 +3041,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
conf->cluster_sync_low = mddev->curr_resync_completed;
conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
/* Send resync message */
- md_cluster_ops->resync_info_update(mddev,
- conf->cluster_sync_low,
- conf->cluster_sync_high);
+ mddev->cluster_ops->resync_info_update(mddev,
+ conf->cluster_sync_low,
+ conf->cluster_sync_high);
}
/* For a user-requested sync, we read all readable devices and do a
@@ -3059,7 +3055,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
read_targets--;
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
@@ -3068,7 +3063,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
atomic_set(&r1_bio->remaining, 1);
bio = r1_bio->bios[r1_bio->read_disk];
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
@@ -3090,6 +3084,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
int i;
struct raid1_info *disk;
struct md_rdev *rdev;
+ size_t r1bio_size;
int err = -ENOMEM;
conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
@@ -3126,21 +3121,15 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->tmppage)
goto abort;
- conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
- if (!conf->poolinfo)
- goto abort;
- conf->poolinfo->raid_disks = mddev->raid_disks * 2;
- err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
- rbio_pool_free, conf->poolinfo);
- if (err)
+ r1bio_size = offsetof(struct r1bio, bios[mddev->raid_disks * 2]);
+ conf->r1bio_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, r1bio_size);
+ if (!conf->r1bio_pool)
goto abort;
err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
if (err)
goto abort;
- conf->poolinfo->mddev = mddev;
-
err = -EINVAL;
spin_lock_init(&conf->device_lock);
conf->raid_disks = mddev->raid_disks;
@@ -3203,10 +3192,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
abort:
if (conf) {
- mempool_exit(&conf->r1bio_pool);
+ mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
- kfree(conf->poolinfo);
kfree(conf->nr_pending);
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
@@ -3224,11 +3212,12 @@ static int raid1_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
+ lim.logical_block_size = mddev->logical_block_size;
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -3264,8 +3253,11 @@ static int raid1_run(struct mddev *mddev)
if (!mddev_is_dm(mddev)) {
ret = raid1_set_limits(mddev);
- if (ret)
+ if (ret) {
+ if (!mddev->private)
+ raid1_free(mddev, conf);
return ret;
+ }
}
mddev->degraded = 0;
@@ -3279,13 +3271,15 @@ static int raid1_run(struct mddev *mddev)
*/
if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(mddev, &conf->thread);
+ if (!mddev->private)
+ raid1_free(mddev, conf);
return -EINVAL;
}
if (conf->raid_disks - mddev->degraded == 1)
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
- if (mddev->recovery_cp != MaxSector)
+ if (mddev->resync_offset != MaxSector)
pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
@@ -3312,10 +3306,9 @@ static void raid1_free(struct mddev *mddev, void *priv)
{
struct r1conf *conf = priv;
- mempool_exit(&conf->r1bio_pool);
+ mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
- kfree(conf->poolinfo);
kfree(conf->nr_pending);
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
@@ -3334,20 +3327,22 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize = raid1_size(mddev, sectors, 0);
- int ret;
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
- mddev->recovery_cp > mddev->dev_sectors) {
- mddev->recovery_cp = mddev->dev_sectors;
+ mddev->resync_offset > mddev->dev_sectors) {
+ mddev->resync_offset = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
mddev->dev_sectors = sectors;
@@ -3368,17 +3363,13 @@ static int raid1_reshape(struct mddev *mddev)
* At the same time, we "pack" the devices so that all the missing
* devices have the higher raid_disk numbers.
*/
- mempool_t newpool, oldpool;
- struct pool_info *newpoolinfo;
+ mempool_t *newpool, *oldpool;
+ size_t new_r1bio_size;
struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
int d, d2;
- int ret;
-
- memset(&newpool, 0, sizeof(newpool));
- memset(&oldpool, 0, sizeof(oldpool));
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3404,24 +3395,16 @@ static int raid1_reshape(struct mddev *mddev)
return -EBUSY;
}
- newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
- if (!newpoolinfo)
+ new_r1bio_size = offsetof(struct r1bio, bios[raid_disks * 2]);
+ newpool = mempool_create_kmalloc_pool(NR_RAID_BIOS, new_r1bio_size);
+ if (!newpool) {
return -ENOMEM;
- newpoolinfo->mddev = mddev;
- newpoolinfo->raid_disks = raid_disks * 2;
-
- ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
- rbio_pool_free, newpoolinfo);
- if (ret) {
- kfree(newpoolinfo);
- return ret;
}
newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
raid_disks, 2),
GFP_KERNEL);
if (!newmirrors) {
- kfree(newpoolinfo);
- mempool_exit(&newpool);
+ mempool_destroy(newpool);
return -ENOMEM;
}
@@ -3446,8 +3429,6 @@ static int raid1_reshape(struct mddev *mddev)
}
kfree(conf->mirrors);
conf->mirrors = newmirrors;
- kfree(conf->poolinfo);
- conf->poolinfo = newpoolinfo;
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (raid_disks - conf->raid_disks);
@@ -3461,7 +3442,7 @@ static int raid1_reshape(struct mddev *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
- mempool_exit(&oldpool);
+ mempool_destroy(oldpool);
return 0;
}
@@ -3499,9 +3480,13 @@ static void *raid1_takeover(struct mddev *mddev)
static struct md_personality raid1_personality =
{
- .name = "raid1",
- .level = 1,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID1,
+ .name = "raid1",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid1_make_request,
.run = raid1_run,
.free = raid1_free,
@@ -3518,18 +3503,18 @@ static struct md_personality raid1_personality =
.takeover = raid1_takeover,
};
-static int __init raid_init(void)
+static int __init raid1_init(void)
{
- return register_md_personality(&raid1_personality);
+ return register_md_submodule(&raid1_personality.head);
}
-static void raid_exit(void)
+static void __exit raid1_exit(void)
{
- unregister_md_personality(&raid1_personality);
+ unregister_md_submodule(&raid1_personality.head);
}
-module_init(raid_init);
-module_exit(raid_exit);
+module_init(raid1_init);
+module_exit(raid1_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
MODULE_ALIAS("md-personality-3"); /* RAID1 */
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5300cbaa58a4..2ebe35aaa534 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -49,22 +49,6 @@ struct raid1_info {
sector_t seq_start;
};
-/*
- * memory pools need a pointer to the mddev, so they can force an unplug
- * when memory is tight, and a count of the number of drives that the
- * pool was allocated for, so they know how much to allocate and free.
- * mddev->raid_disks cannot be used, as it can change while a pool is active
- * These two datums are stored in a kmalloced struct.
- * The 'raid_disks' here is twice the raid_disks in r1conf.
- * This allows space for each 'real' device can have a replacement in the
- * second half of the array.
- */
-
-struct pool_info {
- struct mddev *mddev;
- int raid_disks;
-};
-
struct r1conf {
struct mddev *mddev;
struct raid1_info *mirrors; /* twice 'raid_disks' to
@@ -114,11 +98,7 @@ struct r1conf {
*/
int recovery_disabled;
- /* poolinfo contains information about the content of the
- * mempools - it changes when the array grows or shrinks
- */
- struct pool_info *poolinfo;
- mempool_t r1bio_pool;
+ mempool_t *r1bio_pool;
mempool_t r1buf_pool;
struct bio_set bio_split;
@@ -188,7 +168,6 @@ struct r1bio {
enum r1bio_state {
R1BIO_Uptodate,
R1BIO_IsSync,
- R1BIO_Degraded,
R1BIO_BehindIO,
/* Set ReadError on bios that experience a readerror so that
* raid1d knows what to do with them.
@@ -199,7 +178,9 @@ enum r1bio_state {
* any write was successful. Otherwise we call when
* any write-behind write succeeds, otherwise we call
* with failure when last write completes (and all failed).
- * Record that bi_end_io was called with this flag...
+ *
+ * And for bio_split errors, record that bi_end_io was called
+ * with this flag...
*/
R1BIO_Returned,
/* If a write for this request means we can clear some
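The raid1 hunks above drop struct pool_info entirely: the r1bio mempool element size is now derived from the trailing bios[] array with offsetof(struct r1bio, bios[raid_disks * 2]), and the pool is created with mempool_create_kmalloc_pool() instead of mempool_init() plus custom alloc/free callbacks. A minimal sketch of that sizing pattern, with a hypothetical stand-in struct (demo_rbio) rather than the real r1bio:

	#include <linux/mempool.h>
	#include <linux/stddef.h>

	struct bio;

	/* Stand-in layout: fixed header plus one bio pointer per disk
	 * and one per replacement, hence the factor of two below. */
	struct demo_rbio {
		int		sectors;
		struct bio	*bios[];
	};

	static mempool_t *demo_create_rbio_pool(int raid_disks, int min_nr)
	{
		/* element size covers bios[0 .. 2 * raid_disks - 1] */
		size_t size = offsetof(struct demo_rbio, bios[raid_disks * 2]);

		return mempool_create_kmalloc_pool(min_nr, size);
	}

Because the element size is baked into the pool, reshaping to a different disk count allocates a new pool and releases the old one with mempool_destroy(), as the raid1_reshape() hunk does.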
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 18989231791a..84be4cc7e873 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -162,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r10_bio->devs[j].bio = bio;
if (!conf->have_replacement)
continue;
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r10_bio->devs[j].repl_bio = bio;
}
/*
@@ -321,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
struct bio *bio = r10_bio->master_bio;
struct r10conf *conf = r10_bio->mddev->private;
- if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- bio->bi_status = BLK_STS_IOERR;
+ if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) {
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ }
- bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
@@ -398,6 +401,8 @@ static void raid10_end_read_request(struct bio *bio)
* wait for the 'master' bio.
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
+ } else if (!raid1_should_handle_error(bio)) {
+ uptodate = 1;
} else {
/* If all other devices that store this block have
* failed, we want to return the error upwards rather
@@ -428,10 +433,6 @@ static void close_write(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state),
- false);
md_write_end(mddev);
}
@@ -459,9 +460,8 @@ static void raid10_end_write_request(struct bio *bio)
int slot, repl;
struct md_rdev *rdev = NULL;
struct bio *to_put = NULL;
- bool discard_error;
-
- discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
+ bool ignore_error = !raid1_should_handle_error(bio) ||
+ (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
@@ -475,7 +475,7 @@ static void raid10_end_write_request(struct bio *bio)
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
- if (bio->bi_status && !discard_error) {
+ if (bio->bi_status && !ignore_error) {
if (repl)
/* Never record new bad blocks to replacement,
* just fail it.
@@ -501,7 +501,6 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_WriteError, &r10_bio->state);
else {
/* Fail the request */
- set_bit(R10BIO_Degraded, &r10_bio->state);
r10_bio->devs[slot].bio = NULL;
to_put = bio;
dec_rdev = 1;
@@ -531,7 +530,7 @@ static void raid10_end_write_request(struct bio *bio)
/* Maybe we can clear some bad blocks. */
if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->sectors) &&
- !discard_error) {
+ !ignore_error) {
bio_put(bio);
if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
@@ -752,7 +751,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
for (slot = 0; slot < conf->copies ; slot++) {
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
sector_t dev_sector;
unsigned int pending;
bool nonrot;
@@ -1151,15 +1150,12 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
{
struct r10conf *conf = mddev->private;
struct bio *read_bio;
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
struct md_rdev *rdev;
char b[BDEVNAME_SIZE];
int slot = r10_bio->read_slot;
struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO;
- int error;
if (slot >= 0 && r10_bio->devs[slot].rdev) {
/*
@@ -1187,8 +1183,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
}
- if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors))
+ if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
+ raid_end_bio_io(r10_bio);
return;
+ }
+
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
if (err_rdev) {
@@ -1205,17 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
rdev->bdev,
(unsigned long long)r10_bio->sector);
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, &conf->bio_split);
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
- goto err_handle;
- }
- bio_chain(split, bio);
allow_barrier(conf);
- submit_bio_noacct(bio);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
wait_barrier(conf, false);
- bio = split;
+ if (!bio) {
+ set_bit(R10BIO_Returned, &r10_bio->state);
+ goto err_handle;
+ }
+
r10_bio->master_bio = bio;
r10_bio->sectors = max_sectors;
}
@@ -1226,6 +1223,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
r10_bio->master_bio = bio;
}
read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
+ read_bio->bi_opf &= ~REQ_NOWAIT;
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1233,7 +1231,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_end_io = raid10_end_read_request;
- read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &rdev->flags) &&
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -1243,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
return;
err_handle:
atomic_dec(&rdev->nr_pending);
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
}
@@ -1252,9 +1247,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
struct bio *bio, bool replacement,
int n_copy)
{
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
- const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
unsigned long flags;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
@@ -1265,6 +1257,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
conf->mirrors[devnum].rdev;
mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set);
+ mbio->bi_opf &= ~REQ_NOWAIT;
if (replacement)
r10_bio->devs[n_copy].repl_bio = mbio;
else
@@ -1273,7 +1266,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_opf = op | do_sync | do_fua;
if (!replacement && test_bit(FailFast,
&conf->mirrors[devnum].rdev->flags)
&& enough(conf, devnum))
@@ -1356,12 +1348,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int i, k;
sector_t sectors;
int max_sectors;
- int error;
if ((mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector,
- bio_end_sector(bio)))) {
+ mddev->cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector,
+ bio_end_sector(bio)))) {
DEFINE_WAIT(w);
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
@@ -1371,7 +1362,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE);
- if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio)))
break;
schedule();
@@ -1380,8 +1371,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
}
sectors = r10_bio->sectors;
- if (!regular_request_wait(mddev, conf, bio, sectors))
+ if (!regular_request_wait(mddev, conf, bio, sectors)) {
+ raid_end_bio_io(r10_bio);
return;
+ }
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1437,14 +1431,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
- if (!rdev && !rrdev) {
- set_bit(R10BIO_Degraded, &r10_bio->state);
+ if (!rdev && !rrdev)
continue;
- }
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
- int bad_sectors;
+ sector_t bad_sectors;
int is_bad;
is_bad = is_badblock(rdev, dev_sector, max_sectors,
@@ -1457,18 +1449,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* to other devices yet
*/
max_sectors = bad_sectors;
- /* We don't set R10BIO_Degraded as that
- * only applies if the disk is missing,
- * so it might be re-added, and we want to
- * know to recover this chunk.
- * In this case the device is here, and the
- * fact that this chunk is not in-sync is
- * recorded in the bad block log.
- */
continue;
}
if (is_bad) {
- int good_sectors = first_bad - dev_sector;
+ int good_sectors;
+
+ /*
+ * We cannot atomically write this, so just
+ * error in that case. It could be possible to
+ * atomically write other mirrors, but the
+ * complexity of supporting that is not worth
+ * the benefit.
+ */
+ if (bio->bi_opf & REQ_ATOMIC)
+ goto err_handle;
+
+ good_sectors = first_bad - dev_sector;
if (good_sectors < max_sectors)
max_sectors = good_sectors;
}
@@ -1487,25 +1483,21 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->sectors = max_sectors;
if (r10_bio->sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, r10_bio->sectors,
- GFP_NOIO, &conf->bio_split);
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
- goto err_handle;
- }
- bio_chain(split, bio);
allow_barrier(conf);
- submit_bio_noacct(bio);
+ bio = bio_submit_split_bioset(bio, r10_bio->sectors,
+ &conf->bio_split);
wait_barrier(conf, false);
- bio = split;
+ if (!bio) {
+ set_bit(R10BIO_Returned, &r10_bio->state);
+ goto err_handle;
+ }
+
r10_bio->master_bio = bio;
}
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
- false);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
@@ -1531,8 +1523,6 @@ err_handle:
}
}
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
}
@@ -1633,11 +1623,10 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return -EAGAIN;
- if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) {
+ if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
bio_wouldblock_error(bio);
return 0;
}
- wait_barrier(conf, false);
/*
* Check reshape again to avoid reshape happens after checking
@@ -1680,7 +1669,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
return 0;
}
+
bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf);
/* Resend the first split part */
submit_bio_noacct(split);
@@ -1695,7 +1686,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
return 0;
}
+
bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf);
/* Resend the second split part */
submit_bio_noacct(bio);
@@ -1745,6 +1738,7 @@ retry_discard:
* The discard bio is completed only when the first r10bio finishes
*/
if (first_copy) {
+ md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
set_bit(R10BIO_Discard, &r10_bio->state);
first_copy = false;
@@ -2117,7 +2111,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int last = conf->geo.raid_disks - 1;
struct raid10_info *p;
- if (mddev->recovery_cp < MaxSector)
+ if (mddev->resync_offset < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
* very different from resync
*/
@@ -2435,7 +2429,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
tbio->bi_opf |= MD_FAILFAST;
@@ -2447,18 +2440,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* that are active
*/
for (i = 0; i < conf->copies; i++) {
- int d;
-
tbio = r10_bio->devs[i].repl_bio;
if (!tbio || !tbio->bi_end_io)
continue;
if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
&& r10_bio->devs[i].bio != fbio)
bio_copy_data(tbio, fbio);
- d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(tbio));
submit_bio_noacct(tbio);
}
@@ -2592,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(wbio2));
submit_bio_noacct(wbio2);
}
}
@@ -2788,7 +2773,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
}
}
-static int narrow_write_error(struct r10bio *r10_bio, int i)
+static bool narrow_write_error(struct r10bio *r10_bio, int i)
{
struct bio *bio = r10_bio->master_bio;
struct mddev *mddev = r10_bio->mddev;
@@ -2809,10 +2794,10 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r10_bio->sectors;
- int ok = 1;
+ bool ok = true;
if (rdev->badblocks.shift < 0)
- return 0;
+ return false;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -2951,11 +2936,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_status) {
fail = true;
- if (!narrow_write_error(r10_bio, m)) {
+ if (!narrow_write_error(r10_bio, m))
md_error(conf->mddev, rdev);
- set_bit(R10BIO_Degraded,
- &r10_bio->state);
- }
rdev_dec_pending(rdev, conf->mddev);
}
bio = r10_bio->devs[m].repl_bio;
@@ -3014,8 +2996,6 @@ static void raid10d(struct md_thread *thread)
r10_bio = list_first_entry(&tmp, struct r10bio,
retry_list);
list_del(&r10_bio->retry_list);
- if (mddev->degraded)
- set_bit(R10BIO_Degraded, &r10_bio->state);
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
@@ -3199,7 +3179,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* of a clean array, like RAID1 does.
*/
if (mddev->bitmap == NULL &&
- mddev->recovery_cp == MaxSector &&
+ mddev->resync_offset == MaxSector &&
mddev->reshape_position == MaxSector &&
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
@@ -3235,15 +3215,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- mddev->bitmap_ops->end_sync(mddev,
- mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
- mddev->bitmap_ops->end_sync(mddev, sect,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, sect, &sync_blocks);
}
} else {
/* completed sync */
@@ -3263,7 +3241,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
conf->fullsync = 0;
}
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@@ -3365,9 +3344,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* we only need to recover the block if it is set in
* the bitmap
*/
- must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
- &sync_blocks,
- true);
+ must_sync = md_bitmap_start_sync(mddev, sect,
+ &sync_blocks, true);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
@@ -3410,9 +3388,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
- must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
- &sync_blocks, still_degraded);
-
+ md_bitmap_start_sync(mddev, sect, &sync_blocks,
+ still_degraded);
any_working = 0;
for (j=0; j<conf->copies;j++) {
int k;
@@ -3420,7 +3397,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t from_addr, to_addr;
struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t sector, first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
if (!rdev ||
!test_bit(In_sync, &rdev->flags))
continue;
@@ -3584,13 +3561,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
- &sync_blocks,
- mddev->degraded) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks,
+ mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
@@ -3616,7 +3593,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
sector_t first_bad, sector;
- int bad_sectors;
+ sector_t bad_sectors;
struct md_rdev *rdev;
if (r10_bio->devs[i].repl_bio)
@@ -3723,7 +3700,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf);
/* Send resync message */
- md_cluster_ops->resync_info_update(mddev,
+ mddev->cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -3756,7 +3733,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
if (broadcast_msg) {
raid10_set_cluster_sync_high(conf);
- md_cluster_ops->resync_info_update(mddev,
+ mddev->cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -3771,7 +3748,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
- md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
submit_bio_noacct(bio);
}
@@ -4023,13 +3999,15 @@ static int raid10_set_queue_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
+ lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
+ lim.chunk_sectors = mddev->chunk_sectors;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -4160,7 +4138,7 @@ static int raid10_run(struct mddev *mddev)
disk->recovery_disabled = mddev->recovery_disabled - 1;
}
- if (mddev->recovery_cp != MaxSector)
+ if (mddev->resync_offset != MaxSector)
pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
pr_info("md/raid10:%s: active with %d out of %d devices\n",
@@ -4240,7 +4218,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
*/
struct r10conf *conf = mddev->private;
sector_t oldsize, size;
- int ret;
if (mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -4254,14 +4231,17 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
mddev->array_sectors > size)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, size, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, size, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors &&
- mddev->recovery_cp > oldsize) {
- mddev->recovery_cp = oldsize;
+ mddev->resync_offset > oldsize) {
+ mddev->resync_offset = oldsize;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
calc_sectors(conf, sectors);
@@ -4290,7 +4270,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
mddev->delta_disks = mddev->raid_disks;
mddev->raid_disks *= 2;
/* make sure it will not be marked as dirty */
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
mddev->dev_sectors = size;
conf = setup_conf(mddev);
@@ -4522,8 +4502,9 @@ static int raid10_start_reshape(struct mddev *mddev)
oldsize = raid10_size(mddev, 0, 0);
newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
- if (!mddev_is_clustered(mddev)) {
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ if (!mddev_is_clustered(mddev) &&
+ md_bitmap_enabled(mddev, false)) {
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
if (ret)
goto abort;
else
@@ -4545,13 +4526,14 @@ static int raid10_start_reshape(struct mddev *mddev)
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
goto out;
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ /* cluster can't be set up without a bitmap */
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
if (ret)
goto abort;
- ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+ ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
- mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
+ mddev->bitmap_ops->resize(mddev, oldsize, 0);
goto abort;
}
}
@@ -4840,7 +4822,7 @@ read_more:
conf->cluster_sync_low = sb_reshape_pos;
}
- md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+ mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -4895,7 +4877,6 @@ read_more:
r10_bio->sectors = nr_sectors;
/* Now submit the read */
- md_sync_acct_bio(read_bio, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
submit_bio_noacct(read_bio);
@@ -4955,7 +4936,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
atomic_inc(&rdev->nr_pending);
- md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
submit_bio_noacct(b);
@@ -4985,7 +4965,7 @@ static void raid10_update_reshape_pos(struct mddev *mddev)
struct r10conf *conf = mddev->private;
sector_t lo, hi;
- md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+ mddev->cluster_ops->resync_info_get(mddev, &lo, &hi);
if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
|| mddev->reshape_position == MaxSector)
conf->reshape_progress = mddev->reshape_position;
@@ -5104,8 +5084,8 @@ static void raid10_finish_reshape(struct mddev *mddev)
return;
if (mddev->delta_disks > 0) {
- if (mddev->recovery_cp > mddev->resync_max_sectors) {
- mddev->recovery_cp = mddev->resync_max_sectors;
+ if (mddev->resync_offset > mddev->resync_max_sectors) {
+ mddev->resync_offset = mddev->resync_max_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
mddev->resync_max_sectors = mddev->array_sectors;
@@ -5131,9 +5111,13 @@ static void raid10_finish_reshape(struct mddev *mddev)
static struct md_personality raid10_personality =
{
- .name = "raid10",
- .level = 10,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID10,
+ .name = "raid10",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid10_make_request,
.run = raid10_run,
.free = raid10_free,
@@ -5153,18 +5137,18 @@ static struct md_personality raid10_personality =
.update_reshape_pos = raid10_update_reshape_pos,
};
-static int __init raid_init(void)
+static int __init raid10_init(void)
{
- return register_md_personality(&raid10_personality);
+ return register_md_submodule(&raid10_personality.head);
}
-static void raid_exit(void)
+static void __exit raid10_exit(void)
{
- unregister_md_personality(&raid10_personality);
+ unregister_md_submodule(&raid10_personality.head);
}
-module_init(raid_init);
-module_exit(raid_exit);
+module_init(raid10_init);
+module_exit(raid10_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2e75e88d0802..da00a55f7a55 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -161,11 +161,12 @@ enum r10bio_state {
R10BIO_IsSync,
R10BIO_IsRecover,
R10BIO_IsReshape,
- R10BIO_Degraded,
/* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them.
*/
R10BIO_ReadError,
+/* For bio_split errors, record that bi_end_io was called. */
+ R10BIO_Returned,
/* If a write for this request means we can clear some
* known-bad-block records, we set this flag.
*/
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b4f7b79fd187..e29e69335c69 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -313,10 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
}
}
}
@@ -718,7 +714,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
block = page_address(io->meta_page);
block->meta_size = cpu_to_le32(io->meta_offset);
- crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
+ crc = crc32c(log->uuid_checksum, block, PAGE_SIZE);
block->checksum = cpu_to_le32(crc);
log->current_io = NULL;
@@ -1023,10 +1019,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
/* checksum is already calculated in last run */
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
continue;
- addr = kmap_atomic(sh->dev[i].page);
- sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
- addr, PAGE_SIZE);
- kunmap_atomic(addr);
+ addr = kmap_local_page(sh->dev[i].page);
+ sh->dev[i].log_checksum = crc32c(log->uuid_checksum,
+ addr, PAGE_SIZE);
+ kunmap_local(addr);
}
parity_pages = 1 + !!(sh->qd_idx >= 0);
data_pages = write_disks - parity_pages;
@@ -1745,7 +1741,7 @@ static int r5l_recovery_read_meta_block(struct r5l_log *log,
le64_to_cpu(mb->position) != ctx->pos)
return -EINVAL;
- crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != crc)
return -EINVAL;
@@ -1784,8 +1780,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
return -ENOMEM;
r5l_recovery_create_empty_meta_block(log, page, pos, seq);
mb = page_address(page);
- mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
- mb, PAGE_SIZE));
+ mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, mb, PAGE_SIZE));
if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE |
REQ_SYNC | REQ_FUA, false)) {
__free_page(page);
@@ -1979,9 +1974,9 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log,
u32 checksum;
r5l_recovery_read_page(log, ctx, page, log_offset);
- addr = kmap_atomic(page);
- checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
- kunmap_atomic(addr);
+ addr = kmap_local_page(page);
+ checksum = crc32c(log->uuid_checksum, addr, PAGE_SIZE);
+ kunmap_local(addr);
return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
}
@@ -2381,11 +2376,11 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
payload->size = cpu_to_le32(BLOCK_SECTORS);
payload->location = cpu_to_le64(
raid5_compute_blocknr(sh, i, 0));
- addr = kmap_atomic(dev->page);
+ addr = kmap_local_page(dev->page);
payload->checksum[0] = cpu_to_le32(
- crc32c_le(log->uuid_checksum, addr,
- PAGE_SIZE));
- kunmap_atomic(addr);
+ crc32c(log->uuid_checksum, addr,
+ PAGE_SIZE));
+ kunmap_local(addr);
sync_page_io(log->rdev, write_pos, PAGE_SIZE,
dev->page, REQ_OP_WRITE, false);
write_pos = r5l_ring_add(log, write_pos,
@@ -2396,8 +2391,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
}
}
mb->meta_size = cpu_to_le32(offset);
- mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
- mb, PAGE_SIZE));
+ mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum,
+ mb, PAGE_SIZE));
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false);
sh->log_start = ctx->pos;
@@ -2888,10 +2883,10 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
- addr = kmap_atomic(sh->dev[i].page);
- sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
- addr, PAGE_SIZE);
- kunmap_atomic(addr);
+ addr = kmap_local_page(sh->dev[i].page);
+ sh->dev[i].log_checksum = crc32c(log->uuid_checksum,
+ addr, PAGE_SIZE);
+ kunmap_local(addr);
pages++;
}
WARN_ON(pages == 0);
@@ -2973,7 +2968,7 @@ static int r5l_load_log(struct r5l_log *log)
}
stored_crc = le32_to_cpu(mb->checksum);
mb->checksum = 0;
- expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ expected_crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != expected_crc) {
create_super = true;
goto create;
@@ -3081,8 +3076,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
return -ENOMEM;
log->rdev = rdev;
log->need_cache_flush = bdev_write_cache(rdev->bdev);
- log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
- sizeof(rdev->mddev->uuid));
+ log->uuid_checksum = crc32c(~0, rdev->mddev->uuid,
+ sizeof(rdev->mddev->uuid));
mutex_init(&log->io_mutex);
@@ -3109,7 +3104,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
goto out_mempool;
spin_lock_init(&log->tree_lock);
- INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+ INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT);
thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev,
"reclaim");
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 37c4da5311ca..56b234683ee6 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -346,9 +346,9 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
le32_add_cpu(&e->pp_size, PAGE_SIZE);
io->pp_size += PAGE_SIZE;
- e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
- page_address(sh->ppl_page),
- PAGE_SIZE));
+ e->checksum = cpu_to_le32(crc32c(le32_to_cpu(e->checksum),
+ page_address(sh->ppl_page),
+ PAGE_SIZE));
}
list_add_tail(&sh->log_list, &io->stripe_list);
@@ -454,7 +454,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
}
pplhdr->entries_count = cpu_to_le32(io->entries_count);
- pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
+ pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PPL_HEADER_SIZE));
/* Rewind the buffer if the current PPL is larger than the remaining space */
if (log->use_multippl &&
@@ -998,7 +998,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
goto out;
}
- crc = crc32c_le(crc, page_address(page), s);
+ crc = crc32c(crc, page_address(page), s);
pp_size -= s;
sector += s >> 9;
@@ -1052,7 +1052,7 @@ static int ppl_write_empty_header(struct ppl_log *log)
log->rdev->ppl.size, GFP_NOIO, 0);
memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
- pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+ pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PAGE_SIZE));
if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC |
@@ -1106,7 +1106,7 @@ static int ppl_load_distributed(struct ppl_log *log)
/* check header validity */
crc_stored = le32_to_cpu(pplhdr->checksum);
pplhdr->checksum = 0;
- crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+ crc = ~crc32c(~0, pplhdr, PAGE_SIZE);
if (crc_stored != crc) {
pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n",
@@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log)
le64_to_cpu(pplhdr->generation));
/* attempt to recover from log if we are starting a dirty array */
- if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector)
+ if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector)
ret = ppl_recover(log, pplhdr, pplhdr_offset);
/* write empty header if we are starting the array */
@@ -1390,7 +1390,7 @@ int ppl_init_log(struct r5conf *conf)
spin_lock_init(&ppl_conf->no_mem_stripes_lock);
if (!mddev->external) {
- ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
+ ppl_conf->signature = ~crc32c(~0, mddev->uuid, sizeof(mddev->uuid));
ppl_conf->block_size = 512;
} else {
ppl_conf->block_size =
@@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf)
if (ret) {
goto err;
- } else if (!mddev->pers && mddev->recovery_cp == 0 &&
+ } else if (!mddev->pers && mddev->resync_offset == 0 &&
ppl_conf->recovered_entries > 0 &&
ppl_conf->mismatch_count == 0) {
/*
* If we are starting a dirty array and the recovery succeeds
* without any issues, set the array as clean.
*/
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
/* no mismatch allowed when enabling PPL for a running array */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f09e7677ee9f..e57ce3295292 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -906,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
- !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
- is_full_stripe_write(sh);
+ is_full_stripe_write(sh);
}
/* we only do back search */
@@ -1241,10 +1240,6 @@ again:
}
if (rdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
@@ -1301,10 +1296,6 @@ again:
submit_bio_noacct(bi);
}
if (rrdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
@@ -1345,8 +1336,6 @@ again:
submit_bio_noacct(rbi);
}
if (!rdev && !rrdev) {
- if (op_is_write(op))
- set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %d on disc %d for sector %llu\n",
bi->bi_opf, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -2884,7 +2873,6 @@ static void raid5_end_write_request(struct bio *bi)
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
- set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -3548,29 +3536,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
sh->dev[dd_idx].sector);
- if (conf->mddev->bitmap && firstwrite) {
- /* Cannot hold spinlock over bitmap_startwrite,
- * but must ensure this isn't added to a batch until
- * we have added to the bitmap and set bm_seq.
- * So set STRIPE_BITMAP_PENDING to prevent
- * batching.
- * If multiple __add_stripe_bio() calls race here they
- * much all set STRIPE_BITMAP_PENDING. So only the first one
- * to complete "bitmap_startwrite" gets to set
- * STRIPE_BIT_DELAY. This is important as once a stripe
- * is added to a batch, STRIPE_BIT_DELAY cannot be changed
- * any more.
- */
- set_bit(STRIPE_BITMAP_PENDING, &sh->state);
- spin_unlock_irq(&sh->stripe_lock);
- conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
- spin_lock_irq(&sh->stripe_lock);
- clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
- if (!sh->batch_head) {
- sh->bm_seq = conf->seq_flush+1;
- set_bit(STRIPE_BIT_DELAY, &sh->state);
- }
+ if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}
@@ -3621,7 +3589,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
- int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3646,8 +3613,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
- if (bi)
- bitmap_end = 1;
log_stripe_write_finished(sh);
@@ -3662,11 +3627,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
- bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
@@ -3675,7 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}
- if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3709,10 +3668,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -3785,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& (rdev->recovery_offset <= sh->sector
- || rdev->mddev->recovery_cp <= sh->sector))
+ || rdev->mddev->resync_offset <= sh->sector))
rv = 1;
return rv;
}
@@ -3877,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
* is missing/faulty, then we need to read everything we can.
*/
if (!force_rcw &&
- sh->sector < sh->raid_conf->mddev->recovery_cp)
+ sh->sector < sh->raid_conf->mddev->resync_offset)
/* reconstruct-write isn't being forced */
return 0;
for (i = 0; i < s->failed && i < 2; i++) {
@@ -4061,10 +4016,7 @@ returnbi:
bio_endio(wbi);
wbi = wbi2;
}
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
+
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -4145,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
int disks)
{
int rmw = 0, rcw = 0, i;
- sector_t recovery_cp = conf->mddev->recovery_cp;
+ struct mddev *mddev = conf->mddev;
+ sector_t resync_offset = mddev->resync_offset;
/* Check whether resync is now happening or should start.
* If yes, then the array is dirty (after unclean shutdown or
@@ -4155,15 +4108,21 @@ static int handle_stripe_dirtying(struct r5conf *conf,
* generate correct data from the parity.
*/
if (conf->rmw_level == PARITY_DISABLE_RMW ||
- (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
+ (resync_offset < MaxSector && sh->sector >= resync_offset &&
s->failed == 0)) {
/* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
rcw = 1; rmw = 2;
- pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
- conf->rmw_level, (unsigned long long)recovery_cp,
+ pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n",
+ conf->rmw_level, (unsigned long long)resync_offset,
(unsigned long long)sh->sector);
+ } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced &&
+ !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) {
+ /* The initial recovery is not done, must read everything */
+ rcw = 1; rmw = 2;
+ pr_debug("force RCW by lazy recovery, sh->sector=%llu\n",
+ sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
@@ -4196,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
+ mddev_add_trace_msg(mddev, "raid5 rmw %llu %d",
sh->sector, rmw);
for (i = disks; i--; ) {
@@ -4275,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state);
}
}
- if (rcw && !mddev_is_dm(conf->mddev))
- blk_add_trace_msg(conf->mddev->gendisk->queue,
+ if (rcw && !mddev_is_dm(mddev))
+ blk_add_trace_msg(mddev->gendisk->queue,
"raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector, rcw, qread,
test_bit(STRIPE_DELAYED, &sh->state));
@@ -4341,7 +4300,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
s->locked++;
set_bit(R5_Wantwrite, &dev->flags);
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
case check_state_run:
@@ -4498,7 +4456,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
clear_bit(R5_Wantwrite, &dev->flags);
s->locked--;
}
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
@@ -4748,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
- else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
- /* in sync if before recovery_offset */
- set_bit(R5_Insync, &dev->flags);
- else if (test_bit(R5_UPTODATE, &dev->flags) &&
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <=
+ rdev->recovery_offset) {
+ /*
+ * in sync if:
+ * - normal IO, or
+ * - resync IO that is not lazy recovery
+ *
+ * For lazy recovery, we have to mark the rdev without
+ * In_sync as failed, to build initial xor data.
+ */
+ if (!test_bit(STRIPE_SYNCING, &sh->state) ||
+ !test_bit(MD_RECOVERY_LAZY_RECOVER,
+ &conf->mddev->recovery))
+ set_bit(R5_Insync, &dev->flags);
+ } else if (test_bit(R5_UPTODATE, &dev->flags) &&
test_bit(R5_Expanded, &dev->flags))
/* If we've reshaped into here, we assume it is Insync.
* We will shortly update recovery_offset to make
@@ -4820,14 +4788,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(STRIPE_SYNCING, &sh->state)) {
/* If there is a failed device being replaced,
* we must be recovering.
- * else if we are after recovery_cp, we must be syncing
+ * else if we are after resync_offset, we must be syncing
* else if MD_RECOVERY_REQUESTED is set, we also are syncing.
* else we can only be replacing
* sync and recovery both need to read all devices, and so
* use the same flag.
*/
if (do_recovery ||
- sh->sector >= conf->mddev->recovery_cp ||
+ sh->sector >= conf->mddev->resync_offset ||
test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
s->syncing = 1;
else
@@ -4891,8 +4859,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_COMPUTE_RUN) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
- (1 << STRIPE_BATCH_ERR) |
- (1 << STRIPE_BITMAP_PENDING)),
+ (1 << STRIPE_BATCH_ERR)),
"stripe state: %lx\n", sh->state);
WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
(1 << STRIPE_REPLACED)),
@@ -4900,7 +4867,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
(1 << STRIPE_PREREAD_ACTIVE) |
- (1 << STRIPE_DEGRADED) |
(1 << STRIPE_ON_UNPLUG_LIST)),
head_sh->state & (1 << STRIPE_INSYNC));
@@ -4990,7 +4956,8 @@ static void handle_stripe(struct stripe_head *sh)
goto finish;
if (s.handle_bad_blocks ||
- test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
+ (md_is_rdwr(conf->mddev) &&
+ test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags))) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -5520,17 +5487,17 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
- struct bio *split;
sector_t sector = raid_bio->bi_iter.bi_sector;
unsigned chunk_sects = mddev->chunk_sectors;
unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
if (sectors < bio_sectors(raid_bio)) {
struct r5conf *conf = mddev->private;
- split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
- bio_chain(split, raid_bio);
- submit_bio_noacct(raid_bio);
- raid_bio = split;
+
+ raid_bio = bio_submit_split_bioset(raid_bio, sectors,
+ &conf->bio_split);
+ if (!raid_bio)
+ return NULL;
}
if (!raid5_read_one_chunk(mddev, raid_bio))
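The mask arithmetic above, chunk_sects - (sector & (chunk_sects - 1)), is the distance from the bio's start to the next chunk boundary; a small stand-alone illustration with made-up values, assuming 1024-sector (512 KiB) chunks:

#include <stdio.h>

int main(void)
{
	const unsigned int chunk_sects = 1024;	/* power of two, as md requires */
	unsigned long long sector = 1500;	/* bio start sector */
	unsigned int bio_len = 600;		/* bio size in sectors */
	unsigned int to_boundary = chunk_sects - (sector & (chunk_sects - 1));

	if (to_boundary < bio_len)
		printf("split after %u sectors (boundary at %llu)\n",
		       to_boundary, sector + to_boundary);	/* 548, boundary 2048 */
	else
		printf("bio fits within one chunk\n");
	return 0;
}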
@@ -5784,10 +5751,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0; d < conf->raid_disks - conf->max_degraded;
- d++)
- mddev->bitmap_ops->startwrite(mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
@@ -5906,6 +5869,9 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev,
struct r5conf *conf, sector_t logical_sector)
{
sector_t reshape_progress, reshape_safe;
+
+ if (likely(conf->reshape_progress == MaxSector))
+ return LOC_NO_RESHAPE;
/*
* Spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
@@ -5928,6 +5894,54 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev,
return LOC_BEHIND_RESHAPE;
}
+static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
+ unsigned long *sectors)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t start = *offset;
+ sector_t end = start + *sectors;
+ sector_t prev_start = start;
+ sector_t prev_end = end;
+ int sectors_per_chunk;
+ enum reshape_loc loc;
+ int dd_idx;
+
+ sectors_per_chunk = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ start = round_down(start, sectors_per_chunk);
+ end = round_up(end, sectors_per_chunk);
+
+ start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
+ end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL);
+
+ /*
+ * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make
+ * progress, hence it's the same as LOC_BEHIND_RESHAPE.
+ */
+ loc = get_reshape_loc(mddev, conf, prev_start);
+ if (likely(loc != LOC_AHEAD_OF_RESHAPE)) {
+ *offset = start;
+ *sectors = end - start;
+ return;
+ }
+
+ sectors_per_chunk = conf->prev_chunk_sectors *
+ (conf->previous_raid_disks - conf->max_degraded);
+ prev_start = round_down(prev_start, sectors_per_chunk);
+ prev_end = round_down(prev_end, sectors_per_chunk);
+
+ prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL);
+ prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL);
+
+ /*
+ * For LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO
+ * is handled in make_stripe_request(); we can't know that here, so
+ * set bits for both the old and the new layout.
+ */
+ *offset = min(start, prev_start);
+ *sectors = max(end, prev_end) - *offset;
+}
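A rough user-space sketch of the rounding done by the new raid5_bitmap_sector(): the IO range is widened to whole data stripes before bitmap bits are set. The disk count, chunk size and helper name are assumptions for illustration, and the final mapping through raid5_compute_sector() is deliberately omitted:

#include <stdio.h>

/* Assume a 4-disk RAID5 (max_degraded = 1) with 1024-sector (512 KiB) chunks. */
static void demo_round_to_data_stripe(unsigned long long start,
				      unsigned long long len)
{
	const unsigned int chunk_sectors = 1024;
	const unsigned int data_disks = 3;	/* raid_disks - max_degraded */
	unsigned long long per_stripe = (unsigned long long)chunk_sectors * data_disks;
	unsigned long long lo = start / per_stripe * per_stripe;		/* round_down */
	unsigned long long hi = (start + len + per_stripe - 1) / per_stripe * per_stripe; /* round_up */

	printf("IO [%llu, %llu) -> bitmap range [%llu, %llu)\n",
	       start, start + len, lo, hi);
}

int main(void)
{
	demo_round_to_data_stripe(5000, 100);	/* -> [3072, 6144) */
	return 0;
}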
+
static enum stripe_result make_stripe_request(struct mddev *mddev,
struct r5conf *conf, struct stripe_request_ctx *ctx,
sector_t logical_sector, struct bio *bi)
@@ -5935,22 +5949,19 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
const int rw = bio_data_dir(bi);
enum stripe_result ret;
struct stripe_head *sh;
+ enum reshape_loc loc;
sector_t new_sector;
int previous = 0, flags = 0;
int seq, dd_idx;
seq = read_seqcount_begin(&conf->gen_lock);
-
- if (unlikely(conf->reshape_progress != MaxSector)) {
- enum reshape_loc loc = get_reshape_loc(mddev, conf,
- logical_sector);
- if (loc == LOC_INSIDE_RESHAPE) {
- ret = STRIPE_SCHEDULE_AND_RETRY;
- goto out;
- }
- if (loc == LOC_AHEAD_OF_RESHAPE)
- previous = 1;
+ loc = get_reshape_loc(mddev, conf, logical_sector);
+ if (loc == LOC_INSIDE_RESHAPE) {
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
+ if (loc == LOC_AHEAD_OF_RESHAPE)
+ previous = 1;
new_sector = raid5_compute_sector(conf, logical_sector, previous,
&dd_idx, NULL);
@@ -6127,7 +6138,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
- (conf->reshape_progress != MaxSector) &&
get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) {
bio_wouldblock_error(bi);
if (rw == WRITE)
@@ -6501,11 +6511,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
- mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
return 0;
}
@@ -6534,8 +6545,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
- !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
- true) &&
+ !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
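A tiny arithmetic illustration of the skip computation carried across these two hunks, assuming a stripe size of 8 sectors (4 KiB); plain C, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long long sync_blocks = 1001;	/* sectors the bitmap reports as clean */
	const unsigned int stripe_sectors = 8;	/* assumed RAID5_STRIPE_SECTORS */

	/* do_div() rounds down to whole stripes before the skip is returned */
	sync_blocks /= stripe_sectors;
	printf("skip %llu sectors\n", sync_blocks * stripe_sectors);	/* 1000 */
	return 0;
}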
@@ -6544,7 +6554,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
@@ -6566,9 +6577,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
still_degraded = true;
}
- mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
- still_degraded);
-
+ md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -6760,7 +6769,8 @@ static void raid5d(struct md_thread *thread)
int batch_size, released;
unsigned int offset;
- if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+ if (md_is_rdwr(mddev) &&
+ test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
break;
released = release_stripe_list(conf, conf->temp_inactive_list);
@@ -6772,7 +6782,8 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
- mddev->bitmap_ops->unplug(mddev, true);
+ if (md_bitmap_enabled(mddev, true))
+ mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
@@ -7736,11 +7747,13 @@ static int raid5_set_limits(struct mddev *mddev)
stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
md_init_stacking_limits(&lim);
+ lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
mddev_stack_rdev_limits(mddev, &lim, 0);
rdev_for_each(rdev, mddev)
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
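To make the limits arithmetic in this hunk concrete, here is a hedged stand-alone sketch assuming a 4-disk RAID5 (max_degraded = 1) with 512 KiB chunks; roundup_pow2() is a local stand-in for the kernel's roundup_pow_of_two():

#include <stdio.h>

static unsigned long roundup_pow2(unsigned long v)
{
	unsigned long p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int chunk_sectors = 1024;		/* 512 KiB in 512-byte sectors */
	unsigned int data_disks = 3;			/* raid_disks - max_degraded */
	unsigned long io_min = chunk_sectors << 9;	/* 524288 bytes */
	unsigned long io_opt = io_min * data_disks;	/* 1572864 bytes */
	unsigned long stripe = roundup_pow2(data_disks * (chunk_sectors << 9));

	printf("io_min=%lu io_opt=%lu discard_granularity=%lu\n",
	       io_min, io_opt, stripe);			/* 524288 1572864 2097152 */
	return 0;
}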
@@ -7789,7 +7802,7 @@ static int raid5_run(struct mddev *mddev)
int first = 1;
int ret = -EIO;
- if (mddev->recovery_cp != MaxSector)
+ if (mddev->resync_offset != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
@@ -7930,7 +7943,7 @@ static int raid5_run(struct mddev *mddev)
mdname(mddev));
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
- } else if (mddev->recovery_cp == MaxSector)
+ } else if (mddev->resync_offset == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
}
@@ -7997,7 +8010,7 @@ static int raid5_run(struct mddev *mddev)
mddev->resync_max_sectors = mddev->dev_sectors;
if (mddev->degraded > dirty_parity_disks &&
- mddev->recovery_cp != MaxSector) {
+ mddev->resync_offset != MaxSector) {
if (test_bit(MD_HAS_PPL, &mddev->flags))
pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
mdname(mddev));
@@ -8321,7 +8334,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
- int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
@@ -8331,14 +8343,17 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
mddev->array_sectors > newsize)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, sectors, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
- mddev->recovery_cp > mddev->dev_sectors) {
- mddev->recovery_cp = mddev->dev_sectors;
+ mddev->resync_offset > mddev->dev_sectors) {
+ mddev->resync_offset = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
mddev->dev_sectors = sectors;
@@ -8432,7 +8447,7 @@ static int raid5_start_reshape(struct mddev *mddev)
return -EINVAL;
/* raid5 can't handle concurrent reshape and recovery */
- if (mddev->recovery_cp < MaxSector)
+ if (mddev->resync_offset < MaxSector)
return -EBUSY;
for (i = 0; i < conf->raid_disks; i++)
if (conf->disks[i].replacement)
@@ -8657,7 +8672,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
mddev->raid_disks += 1;
mddev->delta_disks = 1;
/* make sure it will not be marked as dirty */
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
return setup_conf(mddev);
}
@@ -8954,9 +8969,13 @@ static void raid5_prepare_suspend(struct mddev *mddev)
static struct md_personality raid6_personality =
{
- .name = "raid6",
- .level = 6,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID6,
+ .name = "raid6",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -8976,12 +8995,17 @@ static struct md_personality raid6_personality =
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid5_personality =
{
- .name = "raid5",
- .level = 5,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID5,
+ .name = "raid5",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -9001,13 +9025,18 @@ static struct md_personality raid5_personality =
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid4_personality =
{
- .name = "raid4",
- .level = 4,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID4,
+ .name = "raid4",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -9027,6 +9056,7 @@ static struct md_personality raid4_personality =
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static int __init raid5_init(void)
@@ -9034,7 +9064,7 @@ static int __init raid5_init(void)
int ret;
raid5_wq = alloc_workqueue("raid5wq",
- WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+ WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_SYSFS, 0);
if (!raid5_wq)
return -ENOMEM;
@@ -9042,21 +9072,39 @@ static int __init raid5_init(void)
"md/raid5:prepare",
raid456_cpu_up_prepare,
raid456_cpu_dead);
- if (ret) {
- destroy_workqueue(raid5_wq);
- return ret;
- }
- register_md_personality(&raid6_personality);
- register_md_personality(&raid5_personality);
- register_md_personality(&raid4_personality);
+ if (ret)
+ goto err_destroy_wq;
+
+ ret = register_md_submodule(&raid6_personality.head);
+ if (ret)
+ goto err_cpuhp_remove;
+
+ ret = register_md_submodule(&raid5_personality.head);
+ if (ret)
+ goto err_unregister_raid6;
+
+ ret = register_md_submodule(&raid4_personality.head);
+ if (ret)
+ goto err_unregister_raid5;
+
return 0;
+
+err_unregister_raid5:
+ unregister_md_submodule(&raid5_personality.head);
+err_unregister_raid6:
+ unregister_md_submodule(&raid6_personality.head);
+err_cpuhp_remove:
+ cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
+err_destroy_wq:
+ destroy_workqueue(raid5_wq);
+ return ret;
}
-static void raid5_exit(void)
+static void __exit raid5_exit(void)
{
- unregister_md_personality(&raid6_personality);
- unregister_md_personality(&raid5_personality);
- unregister_md_personality(&raid4_personality);
+ unregister_md_submodule(&raid6_personality.head);
+ unregister_md_submodule(&raid5_personality.head);
+ unregister_md_submodule(&raid4_personality.head);
cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
destroy_workqueue(raid5_wq);
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d174e586698f..eafc6e9ed6ee 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -358,7 +358,6 @@ enum {
STRIPE_REPLACED,
STRIPE_PREREAD_ACTIVE,
STRIPE_DELAYED,
- STRIPE_DEGRADED,
STRIPE_BIT_DELAY,
STRIPE_EXPANDING,
STRIPE_EXPAND_SOURCE,
@@ -372,9 +371,6 @@ enum {
STRIPE_ON_RELEASE_LIST,
STRIPE_BATCH_READY,
STRIPE_BATCH_ERR,
- STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
- * to batch yet.
- */
STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
* this bit is used in two scenarios:
*