summary refs log tree commit diff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig23
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bcache/Kconfig1
-rw-r--r--drivers/md/bcache/alloc.c78
-rw-r--r--drivers/md/bcache/bcache.h3
-rw-r--r--drivers/md/bcache/bset.c76
-rw-r--r--drivers/md/bcache/bset.h14
-rw-r--r--drivers/md/bcache/btree.c36
-rw-r--r--drivers/md/bcache/extents.c45
-rw-r--r--drivers/md/bcache/movinggc.c35
-rw-r--r--drivers/md/bcache/request.c16
-rw-r--r--drivers/md/bcache/super.c35
-rw-r--r--drivers/md/bcache/sysfs.c2
-rw-r--r--drivers/md/bcache/util.c2
-rw-r--r--drivers/md/bcache/util.h67
-rw-r--r--drivers/md/bcache/writeback.c5
-rw-r--r--drivers/md/dm-bio-prison-v1.c35
-rw-r--r--drivers/md/dm-bio-prison-v1.h24
-rw-r--r--drivers/md/dm-bio-prison-v2.c3
-rw-r--r--drivers/md/dm-bufio.c40
-rw-r--r--drivers/md/dm-cache-background-tracker.c31
-rw-r--r--drivers/md/dm-cache-background-tracker.h9
-rw-r--r--drivers/md/dm-cache-metadata.c48
-rw-r--r--drivers/md/dm-cache-metadata.h3
-rw-r--r--drivers/md/dm-cache-target.c116
-rw-r--r--drivers/md/dm-clone-metadata.c16
-rw-r--r--drivers/md/dm-clone-target.c27
-rw-r--r--drivers/md/dm-core.h5
-rw-r--r--drivers/md/dm-crypt.c239
-rw-r--r--drivers/md/dm-delay.c60
-rw-r--r--drivers/md/dm-ebs-target.c4
-rw-r--r--drivers/md/dm-era-target.c13
-rw-r--r--drivers/md/dm-init.c4
-rw-r--r--drivers/md/dm-integrity.c686
-rw-r--r--drivers/md/dm-io.c85
-rw-r--r--drivers/md/dm-ioctl.c26
-rw-r--r--drivers/md/dm-linear.c6
-rw-r--r--drivers/md/dm-log-writes.c2
-rw-r--r--drivers/md/dm-mpath.c14
-rw-r--r--drivers/md/dm-ps-io-affinity.c2
-rw-r--r--drivers/md/dm-raid.c76
-rw-r--r--drivers/md/dm-raid1.c5
-rw-r--r--drivers/md/dm-rq.c6
-rw-r--r--drivers/md/dm-snap.c2
-rw-r--r--drivers/md/dm-stripe.c10
-rw-r--r--drivers/md/dm-table.c444
-rw-r--r--drivers/md/dm-target.c1
-rw-r--r--drivers/md/dm-thin-metadata.c6
-rw-r--r--drivers/md/dm-thin.c32
-rw-r--r--drivers/md/dm-unstripe.c4
-rw-r--r--drivers/md/dm-vdo/Kconfig1
-rw-r--r--drivers/md/dm-vdo/Makefile2
-rw-r--r--drivers/md/dm-vdo/block-map.c2
-rw-r--r--drivers/md/dm-vdo/data-vio.c54
-rw-r--r--drivers/md/dm-vdo/data-vio.h5
-rw-r--r--drivers/md/dm-vdo/dedupe.c17
-rw-r--r--drivers/md/dm-vdo/dm-vdo-target.c39
-rw-r--r--drivers/md/dm-vdo/encodings.c2
-rw-r--r--drivers/md/dm-vdo/flush.c3
-rw-r--r--drivers/md/dm-vdo/indexer/chapter-index.c2
-rw-r--r--drivers/md/dm-vdo/indexer/index-layout.c26
-rw-r--r--drivers/md/dm-vdo/indexer/index.c5
-rw-r--r--drivers/md/dm-vdo/indexer/indexer.h4
-rw-r--r--drivers/md/dm-vdo/indexer/io-factory.c2
-rw-r--r--drivers/md/dm-vdo/int-map.c30
-rw-r--r--drivers/md/dm-vdo/io-submitter.c3
-rw-r--r--drivers/md/dm-vdo/message-stats.c48
-rw-r--r--drivers/md/dm-vdo/message-stats.h1
-rw-r--r--drivers/md/dm-vdo/murmurhash3.c9
-rw-r--r--drivers/md/dm-vdo/numeric.h2
-rw-r--r--drivers/md/dm-vdo/packer.c3
-rw-r--r--drivers/md/dm-vdo/physical-zone.c2
-rw-r--r--drivers/md/dm-vdo/recovery-journal.c2
-rw-r--r--drivers/md/dm-vdo/repair.c70
-rw-r--r--drivers/md/dm-vdo/slab-depot.c31
-rw-r--r--drivers/md/dm-vdo/status-codes.c2
-rw-r--r--drivers/md/dm-vdo/status-codes.h2
-rw-r--r--drivers/md/dm-vdo/vdo.c4
-rw-r--r--drivers/md/dm-vdo/vio.c1
-rw-r--r--drivers/md/dm-verity-fec.c93
-rw-r--r--drivers/md/dm-verity-fec.h6
-rw-r--r--drivers/md/dm-verity-target.c705
-rw-r--r--drivers/md/dm-verity-verify-sig.c7
-rw-r--r--drivers/md/dm-verity.h46
-rw-r--r--drivers/md/dm-zero.c1
-rw-r--r--drivers/md/dm-zone.c629
-rw-r--r--drivers/md/dm-zoned-metadata.c50
-rw-r--r--drivers/md/dm-zoned-reclaim.c6
-rw-r--r--drivers/md/dm-zoned-target.c7
-rw-r--r--drivers/md/dm-zoned.h2
-rw-r--r--drivers/md/dm.c400
-rw-r--r--drivers/md/dm.h16
-rw-r--r--drivers/md/md-autodetect.c8
-rw-r--r--drivers/md/md-bitmap.c696
-rw-r--r--drivers/md/md-bitmap.h271
-rw-r--r--drivers/md/md-cluster.c144
-rw-r--r--drivers/md/md-cluster.h2
-rw-r--r--drivers/md/md-linear.c352
-rw-r--r--drivers/md/md.c999
-rw-r--r--drivers/md/md.h181
-rw-r--r--drivers/md/persistent-data/dm-array.c25
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c12
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h14
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h2
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c6
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c14
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c4
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c62
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h6
-rw-r--r--drivers/md/raid0.c43
-rw-r--r--drivers/md/raid1-10.c9
-rw-r--r--drivers/md/raid1.c298
-rw-r--r--drivers/md/raid1.h1
-rw-r--r--drivers/md/raid10.c228
-rw-r--r--drivers/md/raid10.h1
-rw-r--r--drivers/md/raid5-cache.c26
-rw-r--r--drivers/md/raid5-ppl.c2
-rw-r--r--drivers/md/raid5.c384
-rw-r--r--drivers/md/raid5.h8
119 files changed, 5000 insertions, 3664 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 35b1080752cd..0b1870a09e1f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -61,6 +61,19 @@ config MD_BITMAP_FILE
various kernel APIs and can only work with files on a file system not
actually sitting on the MD device.
+config MD_LINEAR
+ tristate "Linear (append) mode"
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, then your multiple devices driver will be able to
+ use the so-called linear mode, i.e. it will combine the hard disk
+ partitions by simply appending one to the other.
+
+ To compile this as a module, choose M here: the module
+ will be called linear.
+
+ If unsure, say Y.
+
config MD_RAID0
tristate "RAID-0 (striping) mode"
depends on BLK_DEV_MD
@@ -540,6 +553,16 @@ config DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING
If unsure, say N.
+config DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING
+ bool "Verity data device root hash signature verification with platform keyring"
+ default DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING
+ depends on DM_VERITY_VERIFY_ROOTHASH_SIG
+ depends on INTEGRITY_PLATFORM_KEYRING
+ help
+ Rely also on the platform keyring to verify dm-verity signatures.
+
+ If unsure, say N.
+
config DM_VERITY_FEC
bool "Verity forward error correction support"
depends on DM_VERITY
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 476a214e4bdc..87bdfc9fe14c 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,12 +29,14 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
+linear-y += md-linear.o
# Note: link order is important. All raid personalities
# must come before md.o, as they each initialise
# themselves, and md.o may use the personalities when it
# is auto-initialised.
+obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index b2d10063d35f..d4697e79d5a3 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -5,6 +5,7 @@ config BCACHE
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
select CLOSURES
+ select MIN_HEAP
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ce13c272c387..8998e61efa40 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -129,12 +129,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
- BUG_ON(!ca->set->gc_mark_valid);
-
- return (!GC_MARK(b) ||
- GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
- !atomic_read(&b->pin) &&
- can_inc_bucket_gen(b);
+ return (ca->set->gc_mark_valid || b->reclaimable_in_gc) &&
+ ((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
+ !atomic_read(&b->pin) && can_inc_bucket_gen(b));
}
void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -148,6 +145,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
bch_inc_gen(ca, b);
b->prio = INITIAL_PRIO;
atomic_inc(&b->pin);
+ b->reclaimable_in_gc = 0;
}
static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -166,40 +164,61 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
* prio is worth 1/8th of what INITIAL_PRIO is worth.
*/
-#define bucket_prio(b) \
-({ \
- unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
- \
- (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
-})
+static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b)
+{
+ unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;
-#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
-#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
+ return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);
+}
+
+static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args)
+{
+ struct bucket **lhs = (struct bucket **)l;
+ struct bucket **rhs = (struct bucket **)r;
+ struct cache *ca = args;
+
+ return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs);
+}
+
+static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args)
+{
+ struct bucket **lhs = (struct bucket **)l;
+ struct bucket **rhs = (struct bucket **)r;
+ struct cache *ca = args;
+
+ return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs);
+}
static void invalidate_buckets_lru(struct cache *ca)
{
struct bucket *b;
- ssize_t i;
+ const struct min_heap_callbacks bucket_max_cmp_callback = {
+ .less = new_bucket_max_cmp,
+ .swp = NULL,
+ };
+ const struct min_heap_callbacks bucket_min_cmp_callback = {
+ .less = new_bucket_min_cmp,
+ .swp = NULL,
+ };
- ca->heap.used = 0;
+ ca->heap.nr = 0;
for_each_bucket(b, ca) {
if (!bch_can_invalidate_bucket(ca, b))
continue;
- if (!heap_full(&ca->heap))
- heap_add(&ca->heap, b, bucket_max_cmp);
- else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
+ if (!min_heap_full(&ca->heap))
+ min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca);
+ else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) {
ca->heap.data[0] = b;
- heap_sift(&ca->heap, 0, bucket_max_cmp);
+ min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca);
}
}
- for (i = ca->heap.used / 2 - 1; i >= 0; --i)
- heap_sift(&ca->heap, i, bucket_min_cmp);
+ min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca);
while (!fifo_full(&ca->free_inc)) {
- if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
+ if (!ca->heap.nr) {
/*
* We don't want to be calling invalidate_buckets()
* multiple times when it can't do anything
@@ -208,6 +227,8 @@ static void invalidate_buckets_lru(struct cache *ca)
wake_up_gc(ca->set);
return;
}
+ b = min_heap_peek(&ca->heap)[0];
+ min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca);
bch_invalidate_one_bucket(ca, b);
}
@@ -352,8 +373,7 @@ static int bch_allocator_thread(void *arg)
*/
retry_invalidate:
- allocator_wait(ca, ca->set->gc_mark_valid &&
- !ca->invalidate_needs_gc);
+ allocator_wait(ca, !ca->invalidate_needs_gc);
invalidate_buckets(ca);
/*
@@ -501,8 +521,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
ca = c->cache;
b = bch_bucket_alloc(ca, reserve, wait);
- if (b == -1)
- goto err;
+ if (b < 0)
+ return -1;
k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
bucket_to_sector(c, b),
@@ -511,10 +531,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
SET_KEY_PTRS(k, 1);
return 0;
-err:
- bch_bucket_free(c, k);
- bkey_put(c, k);
- return -1;
}
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4e6afa89921f..785b0d9008fa 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -200,6 +200,7 @@ struct bucket {
uint8_t gen;
uint8_t last_gc; /* Most out of date gen in the btree */
uint16_t gc_mark; /* Bitfield used by GC. See below for field */
+ uint16_t reclaimable_in_gc:1;
};
/*
@@ -457,7 +458,7 @@ struct cache {
/* Allocation stuff: */
struct bucket *buckets;
- DECLARE_HEAP(struct bucket *, heap);
+ DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap;
/*
* If nonzero, we know we aren't going to find any buckets to invalidate
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 2bba4d6aaaa2..68258a16e125 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -57,6 +57,8 @@ int __bch_count_data(struct btree_keys *b)
struct btree_iter iter;
struct bkey *k;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
if (b->ops->is_extents)
for_each_key(b, k, &iter)
ret += KEY_SIZE(k);
@@ -70,6 +72,8 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
struct btree_iter iter;
const char *err;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
for_each_key(b, k, &iter) {
if (b->ops->is_extents) {
err = "Keys out of order";
@@ -110,9 +114,9 @@ bug:
static void bch_btree_iter_next_check(struct btree_iter *iter)
{
- struct bkey *k = iter->data->k, *next = bkey_next(k);
+ struct bkey *k = iter->heap.data->k, *next = bkey_next(k);
- if (next < iter->data->end &&
+ if (next < iter->heap.data->end &&
bkey_cmp(k, iter->b->ops->is_extents ?
&START_KEY(next) : next) > 0) {
bch_dump_bucket(iter->b);
@@ -885,6 +889,8 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
/*
* If k has preceding key, preceding_key_p will be set to address
* of k's preceding key; otherwise preceding_key_p will be set
@@ -1077,27 +1083,34 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
/* Btree iterator */
-typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
- struct btree_iter_set);
+typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *);
-static inline bool btree_iter_cmp(struct btree_iter_set l,
- struct btree_iter_set r)
+static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args)
{
- return bkey_cmp(l.k, r.k) > 0;
+ const struct btree_iter_set *_l = l;
+ const struct btree_iter_set *_r = r;
+
+ return bkey_cmp(_l->k, _r->k) <= 0;
}
static inline bool btree_iter_end(struct btree_iter *iter)
{
- return !iter->used;
+ return !iter->heap.nr;
}
void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
struct bkey *end)
{
+ const struct min_heap_callbacks callbacks = {
+ .less = new_btree_iter_cmp,
+ .swp = NULL,
+ };
+
if (k != end)
- BUG_ON(!heap_add(iter,
- ((struct btree_iter_set) { k, end }),
- btree_iter_cmp));
+ BUG_ON(!min_heap_push(&iter->heap,
+ &((struct btree_iter_set) { k, end }),
+ &callbacks,
+ NULL));
}
static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
@@ -1107,8 +1120,8 @@ static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
{
struct bkey *ret = NULL;
- iter->size = ARRAY_SIZE(iter->data);
- iter->used = 0;
+ iter->heap.size = ARRAY_SIZE(iter->heap.preallocated);
+ iter->heap.nr = 0;
#ifdef CONFIG_BCACHE_DEBUG
iter->b = b;
@@ -1130,26 +1143,34 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b,
}
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
- btree_iter_cmp_fn *cmp)
+ new_btree_iter_cmp_fn *cmp)
{
struct btree_iter_set b __maybe_unused;
struct bkey *ret = NULL;
+ const struct min_heap_callbacks callbacks = {
+ .less = cmp,
+ .swp = NULL,
+ };
if (!btree_iter_end(iter)) {
bch_btree_iter_next_check(iter);
- ret = iter->data->k;
- iter->data->k = bkey_next(iter->data->k);
+ ret = iter->heap.data->k;
+ iter->heap.data->k = bkey_next(iter->heap.data->k);
- if (iter->data->k > iter->data->end) {
+ if (iter->heap.data->k > iter->heap.data->end) {
WARN_ONCE(1, "bset was corrupt!\n");
- iter->data->k = iter->data->end;
+ iter->heap.data->k = iter->heap.data->end;
}
- if (iter->data->k == iter->data->end)
- heap_pop(iter, b, cmp);
+ if (iter->heap.data->k == iter->heap.data->end) {
+ if (iter->heap.nr) {
+ b = min_heap_peek(&iter->heap)[0];
+ min_heap_pop(&iter->heap, &callbacks, NULL);
+ }
+ }
else
- heap_sift(iter, 0, cmp);
+ min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
}
return ret;
@@ -1157,7 +1178,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
struct bkey *bch_btree_iter_next(struct btree_iter *iter)
{
- return __bch_btree_iter_next(iter, btree_iter_cmp);
+ return __bch_btree_iter_next(iter, new_btree_iter_cmp);
}
@@ -1195,16 +1216,18 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out,
struct btree_iter *iter,
bool fixup, bool remove_stale)
{
- int i;
struct bkey *k, *last = NULL;
BKEY_PADDED(k) tmp;
bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
? bch_ptr_bad
: bch_ptr_invalid;
+ const struct min_heap_callbacks callbacks = {
+ .less = b->ops->sort_cmp,
+ .swp = NULL,
+ };
/* Heapify the iterator, using our comparison function */
- for (i = iter->used / 2 - 1; i >= 0; --i)
- heap_sift(iter, i, b->ops->sort_cmp);
+ min_heapify_all(&iter->heap, &callbacks, NULL);
while (!btree_iter_end(iter)) {
if (b->ops->sort_fixup && fixup)
@@ -1296,6 +1319,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
struct btree_iter iter;
int oldsize = bch_count_data(b);
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
__bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
if (start) {
@@ -1325,6 +1349,8 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
uint64_t start_time = local_clock();
struct btree_iter iter;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
bch_btree_iter_init(b, &iter, NULL);
btree_mergesort(b, new->set->data, &iter, false, true);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index d795c84246b0..f79441acd4c1 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -187,8 +187,9 @@ struct bset_tree {
};
struct btree_keys_ops {
- bool (*sort_cmp)(struct btree_iter_set l,
- struct btree_iter_set r);
+ bool (*sort_cmp)(const void *l,
+ const void *r,
+ void *args);
struct bkey *(*sort_fixup)(struct btree_iter *iter,
struct bkey *tmp);
bool (*insert_fixup)(struct btree_keys *b,
@@ -312,16 +313,17 @@ enum {
BTREE_INSERT_STATUS_FRONT_MERGE,
};
+struct btree_iter_set {
+ struct bkey *k, *end;
+};
+
/* Btree key iteration */
struct btree_iter {
- size_t size, used;
#ifdef CONFIG_BCACHE_DEBUG
struct btree_keys *b;
#endif
- struct btree_iter_set {
- struct bkey *k, *end;
- } data[MAX_BSETS];
+ MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap;
};
typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 196cdacce38f..ed40d8600656 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -149,19 +149,19 @@ void bch_btree_node_read_done(struct btree *b)
{
const char *err = "bad btree header";
struct bset *i = btree_bset_first(b);
- struct btree_iter *iter;
+ struct btree_iter iter;
/*
* c->fill_iter can allocate an iterator with more memory space
* than static MAX_BSETS.
* See the comment around cache_set->fill_iter.
*/
- iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
- iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
- iter->used = 0;
+ iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
+ iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
+ iter.heap.nr = 0;
#ifdef CONFIG_BCACHE_DEBUG
- iter->b = &b->keys;
+ iter.b = &b->keys;
#endif
if (!i->seq)
@@ -199,7 +199,7 @@ void bch_btree_node_read_done(struct btree *b)
if (i != b->keys.set[0].data && !i->keys)
goto err;
- bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
+ bch_btree_iter_push(&iter, i->start, bset_bkey_last(i));
b->written += set_blocks(i, block_bytes(b->c->cache));
}
@@ -211,7 +211,7 @@ void bch_btree_node_read_done(struct btree *b)
if (i->seq == b->keys.set[0].data->seq)
goto err;
- bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
+ bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort);
i = b->keys.set[0].data;
err = "short btree key";
@@ -223,7 +223,7 @@ void bch_btree_node_read_done(struct btree *b)
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->cache->sb));
out:
- mempool_free(iter, &b->c->fill_iter);
+ mempool_free(iter.heap.data, &b->c->fill_iter);
return;
err:
set_btree_node_io_error(b);
@@ -1312,6 +1312,8 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
struct btree_iter iter;
struct bset_tree *t;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
gc->nodes++;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
@@ -1573,6 +1575,8 @@ static unsigned int btree_gc_count_keys(struct btree *b)
struct btree_iter iter;
unsigned int ret = 0;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
ret += bkey_u64s(k);
@@ -1615,6 +1619,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
for (i = r; i < r + ARRAY_SIZE(r); i++)
@@ -1740,18 +1745,20 @@ static void btree_gc_start(struct cache_set *c)
mutex_lock(&c->bucket_lock);
- c->gc_mark_valid = 0;
c->gc_done = ZERO_KEY;
ca = c->cache;
for_each_bucket(b, ca) {
b->last_gc = b->gen;
+ if (bch_can_invalidate_bucket(ca, b))
+ b->reclaimable_in_gc = 1;
if (!atomic_read(&b->pin)) {
SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
}
}
+ c->gc_mark_valid = 0;
mutex_unlock(&c->bucket_lock);
}
@@ -1808,6 +1815,9 @@ static void bch_btree_gc_finish(struct cache_set *c)
for_each_bucket(b, ca) {
c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+ if (b->reclaimable_in_gc)
+ b->reclaimable_in_gc = 0;
+
if (atomic_read(&b->pin))
continue;
@@ -1913,6 +1923,8 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
struct bkey *k, *p = NULL;
struct btree_iter iter;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(b->c, b->level, k);
@@ -1958,6 +1970,8 @@ static int bch_btree_check_thread(void *arg)
cur_idx = prev_idx = 0;
ret = 0;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
/* root node keys are checked before thread created */
bch_btree_iter_init(&c->root->keys, &iter, NULL);
k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
@@ -2054,6 +2068,8 @@ int bch_btree_check(struct cache_set *c)
struct btree_iter iter;
struct btree_check_state check_state;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
/* check and mark root node keys */
for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(c, c->root->level, k);
@@ -2549,6 +2565,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
struct bkey *k;
struct btree_iter iter;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
bch_btree_iter_init(&b->keys, &iter, from);
while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
@@ -2582,6 +2599,7 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
struct bkey *k;
struct btree_iter iter;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
bch_btree_iter_init(&b->keys, &iter, from);
while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index d626ffcbecb9..4b84fda1530a 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -33,15 +33,16 @@ static void sort_key_next(struct btree_iter *iter,
i->k = bkey_next(i->k);
if (i->k == i->end)
- *i = iter->data[--iter->used];
+ *i = iter->heap.data[--iter->heap.nr];
}
-static bool bch_key_sort_cmp(struct btree_iter_set l,
- struct btree_iter_set r)
+static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args)
{
- int64_t c = bkey_cmp(l.k, r.k);
+ struct btree_iter_set *_l = (struct btree_iter_set *)l;
+ struct btree_iter_set *_r = (struct btree_iter_set *)r;
+ int64_t c = bkey_cmp(_l->k, _r->k);
- return c ? c > 0 : l.k < r.k;
+ return !(c ? c > 0 : _l->k < _r->k);
}
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
@@ -238,7 +239,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
}
const struct btree_keys_ops bch_btree_keys_ops = {
- .sort_cmp = bch_key_sort_cmp,
+ .sort_cmp = new_bch_key_sort_cmp,
.insert_fixup = bch_btree_ptr_insert_fixup,
.key_invalid = bch_btree_ptr_invalid,
.key_bad = bch_btree_ptr_bad,
@@ -255,22 +256,28 @@ const struct btree_keys_ops bch_btree_keys_ops = {
* Necessary for btree_sort_fixup() - if there are multiple keys that compare
* equal in different sets, we have to process them newest to oldest.
*/
-static bool bch_extent_sort_cmp(struct btree_iter_set l,
- struct btree_iter_set r)
+
+static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args)
{
- int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+ struct btree_iter_set *_l = (struct btree_iter_set *)l;
+ struct btree_iter_set *_r = (struct btree_iter_set *)r;
+ int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k));
- return c ? c > 0 : l.k < r.k;
+ return !(c ? c > 0 : _l->k < _r->k);
}
static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
struct bkey *tmp)
{
- while (iter->used > 1) {
- struct btree_iter_set *top = iter->data, *i = top + 1;
-
- if (iter->used > 2 &&
- bch_extent_sort_cmp(i[0], i[1]))
+ const struct min_heap_callbacks callbacks = {
+ .less = new_bch_extent_sort_cmp,
+ .swp = NULL,
+ };
+ while (iter->heap.nr > 1) {
+ struct btree_iter_set *top = iter->heap.data, *i = top + 1;
+
+ if (iter->heap.nr > 2 &&
+ !new_bch_extent_sort_cmp(&i[0], &i[1], NULL))
i++;
if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
@@ -278,7 +285,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
if (!KEY_SIZE(i->k)) {
sort_key_next(iter, i);
- heap_sift(iter, i - top, bch_extent_sort_cmp);
+ min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
continue;
}
@@ -288,7 +295,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
else
bch_cut_front(top->k, i->k);
- heap_sift(iter, i - top, bch_extent_sort_cmp);
+ min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
} else {
/* can't happen because of comparison func */
BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
@@ -298,7 +305,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
bch_cut_back(&START_KEY(i->k), tmp);
bch_cut_front(i->k, top->k);
- heap_sift(iter, 0, bch_extent_sort_cmp);
+ min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
return tmp;
} else {
@@ -618,7 +625,7 @@ static bool bch_extent_merge(struct btree_keys *bk,
}
const struct btree_keys_ops bch_extent_keys_ops = {
- .sort_cmp = bch_extent_sort_cmp,
+ .sort_cmp = new_bch_extent_sort_cmp,
.sort_fixup = bch_extent_sort_fixup,
.insert_fixup = bch_extent_insert_fixup,
.key_invalid = bch_extent_invalid,
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index ebd500bdf0b2..45ca134cbf02 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -82,7 +82,7 @@ static void moving_init(struct moving_io *io)
bio_init(bio, NULL, bio->bi_inline_vecs,
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
bio_get(bio);
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
bio->bi_private = &io->cl;
@@ -182,16 +182,19 @@ err: if (!IS_ERR_OR_NULL(w->private))
closure_sync(&cl);
}
-static bool bucket_cmp(struct bucket *l, struct bucket *r)
+static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args)
{
- return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
+ struct bucket **_l = (struct bucket **)l;
+ struct bucket **_r = (struct bucket **)r;
+
+ return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r);
}
static unsigned int bucket_heap_top(struct cache *ca)
{
struct bucket *b;
- return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
+ return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0;
}
void bch_moving_gc(struct cache_set *c)
@@ -199,6 +202,10 @@ void bch_moving_gc(struct cache_set *c)
struct cache *ca = c->cache;
struct bucket *b;
unsigned long sectors_to_move, reserve_sectors;
+ const struct min_heap_callbacks callbacks = {
+ .less = new_bucket_cmp,
+ .swp = NULL,
+ };
if (!c->copy_gc_enabled)
return;
@@ -209,7 +216,7 @@ void bch_moving_gc(struct cache_set *c)
reserve_sectors = ca->sb.bucket_size *
fifo_used(&ca->free[RESERVE_MOVINGGC]);
- ca->heap.used = 0;
+ ca->heap.nr = 0;
for_each_bucket(b, ca) {
if (GC_MARK(b) == GC_MARK_METADATA ||
@@ -218,25 +225,31 @@ void bch_moving_gc(struct cache_set *c)
atomic_read(&b->pin))
continue;
- if (!heap_full(&ca->heap)) {
+ if (!min_heap_full(&ca->heap)) {
sectors_to_move += GC_SECTORS_USED(b);
- heap_add(&ca->heap, b, bucket_cmp);
- } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+ min_heap_push(&ca->heap, &b, &callbacks, NULL);
+ } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) {
sectors_to_move -= bucket_heap_top(ca);
sectors_to_move += GC_SECTORS_USED(b);
ca->heap.data[0] = b;
- heap_sift(&ca->heap, 0, bucket_cmp);
+ min_heap_sift_down(&ca->heap, 0, &callbacks, NULL);
}
}
while (sectors_to_move > reserve_sectors) {
- heap_pop(&ca->heap, b, bucket_cmp);
+ if (ca->heap.nr) {
+ b = min_heap_peek(&ca->heap)[0];
+ min_heap_pop(&ca->heap, &callbacks, NULL);
+ }
sectors_to_move -= GC_SECTORS_USED(b);
}
- while (heap_pop(&ca->heap, b, bucket_cmp))
+ while (ca->heap.nr) {
+ b = min_heap_peek(&ca->heap)[0];
+ min_heap_pop(&ca->heap, &callbacks, NULL);
SET_GC_MOVE(b, 1);
+ }
mutex_unlock(&c->bucket_lock);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 83d112bd2b1c..af345dc6fde1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
struct io *i;
if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
(bio_op(bio) == REQ_OP_DISCARD))
goto skip;
+ if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) {
+ /*
+ * If the cached buckets are all clean now, 'true' will be
+ * returned and all requests will bypass the cache device.
+ * Then c->sectors_to_gc never gets a chance to go negative,
+ * the gc thread is never woken up, and caching stays disabled
+ * forever. Call force_wake_up_gc() here to avoid that.
+ */
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN &&
+ c->gc_mark_valid)
+ force_wake_up_gc(c);
+
+ goto skip;
+ }
+
if (mode == CACHE_MODE_NONE ||
(mode == CACHE_MODE_WRITEAROUND &&
op_is_write(bio_op(bio))))
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 330bcd9ea4a9..e42f1400cea9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -171,7 +171,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
struct page *page;
unsigned int i;
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ page = read_cache_page_gfp(bdev->bd_mapping,
SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
if (IS_ERR(page))
return "IO error";
@@ -881,8 +881,8 @@ static void bcache_device_free(struct bcache_device *d)
bcache_device_detach(d);
if (disk) {
- ida_simple_remove(&bcache_device_idx,
- first_minor_to_idx(disk->first_minor));
+ ida_free(&bcache_device_idx,
+ first_minor_to_idx(disk->first_minor));
put_disk(disk);
}
@@ -897,7 +897,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
sector_t sectors, struct block_device *cached_bdev,
const struct block_device_operations *ops)
{
- struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
struct queue_limits lim = {
@@ -909,6 +908,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
.io_min = block_size,
.logical_block_size = block_size,
.physical_block_size = block_size,
+ .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA,
};
uint64_t n;
int idx;
@@ -940,8 +940,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
if (!d->full_dirty_stripes)
goto out_free_stripe_sectors_dirty;
- idx = ida_simple_get(&bcache_device_idx, 0,
- BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
+ idx = ida_alloc_max(&bcache_device_idx, BCACHE_DEVICE_IDX_MAX - 1,
+ GFP_KERNEL);
if (idx < 0)
goto out_free_full_dirty_stripes;
@@ -974,19 +974,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
d->disk->minors = BCACHE_MINORS;
d->disk->fops = ops;
d->disk->private_data = d;
-
- q = d->disk->queue;
-
- blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
-
- blk_queue_write_cache(q, true, true);
-
return 0;
out_bioset_exit:
bioset_exit(&d->bio_split);
out_ida_remove:
- ida_simple_remove(&bcache_device_idx, idx);
+ ida_free(&bcache_device_idx, idx);
out_free_full_dirty_stripes:
kvfree(d->full_dirty_stripes);
out_free_stripe_sectors_dirty:
@@ -1423,8 +1416,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
}
if (bdev_io_opt(dc->bdev))
- dc->partial_stripes_expensive =
- q->limits.raid_partial_stripes_expensive;
+ dc->partial_stripes_expensive = !!(q->limits.features &
+ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE);
ret = bcache_device_init(&dc->disk, block_size,
bdev_nr_sectors(dc->bdev) - dc->sb.data_offset,
@@ -1725,7 +1718,7 @@ static CLOSURE_CALLBACK(cache_set_flush)
if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
- if (!IS_ERR(c->root))
+ if (!IS_ERR_OR_NULL(c->root))
list_add(&c->root->list, &c->btree_cache);
/*
@@ -1914,8 +1907,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
- sizeof(struct btree_iter_set);
+ iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
+ sizeof(struct btree_iter_set);
c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
if (!c->devices)
@@ -2554,10 +2547,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (IS_ERR(bdev_file))
goto out_free_sb;
- err = "failed to set blocksize";
- if (set_blocksize(file_bdev(bdev_file), 4096))
- goto out_blkdev_put;
-
err = read_super(sb, file_bdev(bdev_file), &sb_disk);
if (err)
goto out_blkdev_put;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 6956beb55326..e8f696cb58c0 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -662,6 +662,8 @@ static unsigned int bch_root_usage(struct cache_set *c)
struct btree *b;
struct btree_iter iter;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
goto lock_root;
do {
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index ae380bc3992e..410d8cb49e50 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * random utiility code, for bcache but in theory not specific to bcache
+ * random utility code, for bcache but in theory not specific to bcache
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index f61ab1bada6c..539454d8e2d0 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
+#include <linux/min_heap.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
@@ -30,16 +31,10 @@ struct closure;
#endif
-#define DECLARE_HEAP(type, name) \
- struct { \
- size_t size, used; \
- type *data; \
- } name
-
#define init_heap(heap, _size, gfp) \
({ \
size_t _bytes; \
- (heap)->used = 0; \
+ (heap)->nr = 0; \
(heap)->size = (_size); \
_bytes = (heap)->size * sizeof(*(heap)->data); \
(heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \
@@ -52,64 +47,6 @@ do { \
(heap)->data = NULL; \
} while (0)
-#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
-
-#define heap_sift(h, i, cmp) \
-do { \
- size_t _r, _j = i; \
- \
- for (; _j * 2 + 1 < (h)->used; _j = _r) { \
- _r = _j * 2 + 1; \
- if (_r + 1 < (h)->used && \
- cmp((h)->data[_r], (h)->data[_r + 1])) \
- _r++; \
- \
- if (cmp((h)->data[_r], (h)->data[_j])) \
- break; \
- heap_swap(h, _r, _j); \
- } \
-} while (0)
-
-#define heap_sift_down(h, i, cmp) \
-do { \
- while (i) { \
- size_t p = (i - 1) / 2; \
- if (cmp((h)->data[i], (h)->data[p])) \
- break; \
- heap_swap(h, i, p); \
- i = p; \
- } \
-} while (0)
-
-#define heap_add(h, d, cmp) \
-({ \
- bool _r = !heap_full(h); \
- if (_r) { \
- size_t _i = (h)->used++; \
- (h)->data[_i] = d; \
- \
- heap_sift_down(h, _i, cmp); \
- heap_sift(h, _i, cmp); \
- } \
- _r; \
-})
-
-#define heap_pop(h, d, cmp) \
-({ \
- bool _r = (h)->used; \
- if (_r) { \
- (d) = (h)->data[0]; \
- (h)->used--; \
- heap_swap(h, 0, (h)->used); \
- heap_sift(h, 0, cmp); \
- } \
- _r; \
-})
-
-#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
-
-#define heap_full(h) ((h)->used == (h)->size)
-
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 8827a6f130ad..453efbbdc8ee 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -334,7 +334,7 @@ static void dirty_init(struct keybuf_key *w)
bio_init(bio, NULL, bio->bi_inline_vecs,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
if (!io->dc->writeback_percent)
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
bio->bi_private = w;
@@ -915,6 +915,7 @@ static int bch_dirty_init_thread(void *arg)
k = p = NULL;
prev_idx = 0;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
bch_btree_iter_init(&c->root->keys, &iter, NULL);
k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
@@ -984,6 +985,8 @@ void bch_sectors_dirty_init(struct bcache_device *d)
struct cache_set *c = d->c;
struct bch_dirty_init_state state;
+ min_heap_init(&iter.heap, NULL, MAX_BSETS);
+
retry_lock:
b = c->root;
rw_lock(0, b, b->level);
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index bca0f39e15b8..b4d1c4329df3 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -198,15 +198,6 @@ int dm_bio_detain(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_bio_detain);
-int dm_get_cell(struct dm_bio_prison *prison,
- struct dm_cell_key *key,
- struct dm_bio_prison_cell *cell_prealloc,
- struct dm_bio_prison_cell **cell_result)
-{
- return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
-}
-EXPORT_SYMBOL_GPL(dm_get_cell);
-
/*
* @inmates must have been initialised prior to this call
*/
@@ -288,32 +279,6 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_cell_visit_release);
-static int __promote_or_release(struct rb_root *root,
- struct dm_bio_prison_cell *cell)
-{
- if (bio_list_empty(&cell->bios)) {
- rb_erase(&cell->node, root);
- return 1;
- }
-
- cell->holder = bio_list_pop(&cell->bios);
- return 0;
-}
-
-int dm_cell_promote_or_release(struct dm_bio_prison *prison,
- struct dm_bio_prison_cell *cell)
-{
- int r;
- unsigned l = lock_nr(&cell->key, prison->num_locks);
-
- spin_lock_irq(&prison->regions[l].lock);
- r = __promote_or_release(&prison->regions[l].cell, cell);
- spin_unlock_irq(&prison->regions[l].lock);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(dm_cell_promote_or_release);
-
/*----------------------------------------------------------------*/
#define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h
index 2a097ed0d85e..d39706c48447 100644
--- a/drivers/md/dm-bio-prison-v1.h
+++ b/drivers/md/dm-bio-prison-v1.h
@@ -73,17 +73,6 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell);
/*
- * Creates, or retrieves a cell that overlaps the given key.
- *
- * Returns 1 if pre-existing cell returned, zero if new cell created using
- * @cell_prealloc.
- */
-int dm_get_cell(struct dm_bio_prison *prison,
- struct dm_cell_key *key,
- struct dm_bio_prison_cell *cell_prealloc,
- struct dm_bio_prison_cell **cell_result);
-
-/*
* Returns false if key is beyond BIO_PRISON_MAX_RANGE or spans a boundary.
*/
bool dm_cell_key_has_valid_range(struct dm_cell_key *key);
@@ -117,19 +106,6 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
void (*visit_fn)(void *, struct dm_bio_prison_cell *),
void *context, struct dm_bio_prison_cell *cell);
-/*
- * Rather than always releasing the prisoners in a cell, the client may
- * want to promote one of them to be the new holder. There is a race here
- * though between releasing an empty cell, and other threads adding new
- * inmates. So this function makes the decision with its lock held.
- *
- * This function can have two outcomes:
- * i) An inmate is promoted to be the holder of the cell (return value of 0).
- * ii) The cell has no inmate for promotion and is released (return value of 1).
- */
-int dm_cell_promote_or_release(struct dm_bio_prison *prison,
- struct dm_bio_prison_cell *cell);
-
/*----------------------------------------------------------------*/
/*
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index fd852981ef9c..cf433b0cf742 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -321,8 +321,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
{
BUG_ON(!cell->exclusive_lock);
- bio_list_merge(bios, &cell->bios);
- bio_list_init(&cell->bios);
+ bio_list_merge_init(bios, &cell->bios);
if (cell->shared_count) {
cell->exclusive_lock = false;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 098bf526136c..aab8240429b0 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -318,9 +318,10 @@ static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *con
*/
enum data_mode {
DATA_MODE_SLAB = 0,
- DATA_MODE_GET_FREE_PAGES = 1,
- DATA_MODE_VMALLOC = 2,
- DATA_MODE_LIMIT = 3
+ DATA_MODE_KMALLOC = 1,
+ DATA_MODE_GET_FREE_PAGES = 2,
+ DATA_MODE_VMALLOC = 3,
+ DATA_MODE_LIMIT = 4
};
struct dm_buffer {
@@ -529,9 +530,6 @@ static struct dm_buffer *list_to_buffer(struct list_head *l)
{
struct lru_entry *le = list_entry(l, struct lru_entry, list);
- if (!le)
- return NULL;
-
return le_to_buffer(le);
}
@@ -1065,6 +1063,7 @@ static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
+static unsigned long dm_bufio_allocated_kmalloc;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;
@@ -1107,6 +1106,7 @@ static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
&dm_bufio_allocated_kmem_cache,
+ &dm_bufio_allocated_kmalloc,
&dm_bufio_allocated_get_free_pages,
&dm_bufio_allocated_vmalloc,
};
@@ -1184,6 +1184,11 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
return kmem_cache_alloc(c->slab_cache, gfp_mask);
}
+ if (unlikely(c->block_size < PAGE_SIZE)) {
+ *data_mode = DATA_MODE_KMALLOC;
+ return kmalloc(c->block_size, gfp_mask | __GFP_RECLAIMABLE);
+ }
+
if (c->block_size <= KMALLOC_MAX_SIZE &&
gfp_mask & __GFP_NORETRY) {
*data_mode = DATA_MODE_GET_FREE_PAGES;
@@ -1207,6 +1212,10 @@ static void free_buffer_data(struct dm_bufio_client *c,
kmem_cache_free(c->slab_cache, data);
break;
+ case DATA_MODE_KMALLOC:
+ kfree(data);
+ break;
+
case DATA_MODE_GET_FREE_PAGES:
free_pages((unsigned long)data,
c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
@@ -2474,7 +2483,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
int r;
unsigned int num_locks;
struct dm_bufio_client *c;
- char slab_name[27];
+ char slab_name[64];
+ static atomic_t seqno = ATOMIC_INIT(0);
if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
DMERR("%s: block size not specified or is not multiple of 512b", __func__);
@@ -2521,11 +2531,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
goto bad_dm_io;
}
- if (block_size <= KMALLOC_MAX_SIZE &&
- (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
+ if (block_size <= KMALLOC_MAX_SIZE && !is_power_of_2(block_size)) {
unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u-%u",
+ block_size, atomic_inc_return(&seqno));
c->slab_cache = kmem_cache_create(slab_name, block_size, align,
SLAB_RECLAIM_ACCOUNT, NULL);
if (!c->slab_cache) {
@@ -2534,9 +2544,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
}
}
if (aux_size)
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u-%u",
+ aux_size, atomic_inc_return(&seqno));
else
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u",
+ atomic_inc_return(&seqno));
c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
0, SLAB_RECLAIM_ACCOUNT, NULL);
if (!c->slab_buffer) {
@@ -2901,6 +2913,7 @@ static int __init dm_bufio_init(void)
__u64 mem;
dm_bufio_allocated_kmem_cache = 0;
+ dm_bufio_allocated_kmalloc = 0;
dm_bufio_allocated_get_free_pages = 0;
dm_bufio_allocated_vmalloc = 0;
dm_bufio_current_allocated = 0;
@@ -2989,6 +3002,9 @@ MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
+module_param_named(allocated_kmalloc_bytes, dm_bufio_allocated_kmalloc, ulong, 0444);
+MODULE_PARM_DESC(allocated_kmalloc_bytes, "Memory allocated with kmalloc");
+
module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
index 9c5308298cf1..b4165f172d62 100644
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -11,12 +11,6 @@
#define DM_MSG_PREFIX "dm-background-tracker"
-struct bt_work {
- struct list_head list;
- struct rb_node node;
- struct policy_work work;
-};
-
struct background_tracker {
unsigned int max_work;
atomic_t pending_promotes;
@@ -26,10 +20,10 @@ struct background_tracker {
struct list_head issued;
struct list_head queued;
struct rb_root pending;
-
- struct kmem_cache *work_cache;
};
+struct kmem_cache *btracker_work_cache = NULL;
+
struct background_tracker *btracker_create(unsigned int max_work)
{
struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
@@ -48,12 +42,6 @@ struct background_tracker *btracker_create(unsigned int max_work)
INIT_LIST_HEAD(&b->queued);
b->pending = RB_ROOT;
- b->work_cache = KMEM_CACHE(bt_work, 0);
- if (!b->work_cache) {
- DMERR("couldn't create mempool for background work items");
- kfree(b);
- b = NULL;
- }
return b;
}
@@ -66,10 +54,9 @@ void btracker_destroy(struct background_tracker *b)
BUG_ON(!list_empty(&b->issued));
list_for_each_entry_safe (w, tmp, &b->queued, list) {
list_del(&w->list);
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
}
- kmem_cache_destroy(b->work_cache);
kfree(b);
}
EXPORT_SYMBOL_GPL(btracker_destroy);
@@ -156,12 +143,6 @@ static void update_stats(struct background_tracker *b, struct policy_work *w, in
}
}
-unsigned int btracker_nr_writebacks_queued(struct background_tracker *b)
-{
- return atomic_read(&b->pending_writebacks);
-}
-EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
-
unsigned int btracker_nr_demotions_queued(struct background_tracker *b)
{
return atomic_read(&b->pending_demotes);
@@ -180,7 +161,7 @@ static struct bt_work *alloc_work(struct background_tracker *b)
if (max_work_reached(b))
return NULL;
- return kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+ return kmem_cache_alloc(btracker_work_cache, GFP_NOWAIT);
}
int btracker_queue(struct background_tracker *b,
@@ -203,7 +184,7 @@ int btracker_queue(struct background_tracker *b,
* There was a race, we'll just ignore this second
* bit of work for the same oblock.
*/
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
return -EINVAL;
}
@@ -244,7 +225,7 @@ void btracker_complete(struct background_tracker *b,
update_stats(b, &w->work, -1);
rb_erase(&w->node, &b->pending);
list_del(&w->list);
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
}
EXPORT_SYMBOL_GPL(btracker_complete);
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h
index 5b8f5c667b81..47156c14a44a 100644
--- a/drivers/md/dm-cache-background-tracker.h
+++ b/drivers/md/dm-cache-background-tracker.h
@@ -26,6 +26,14 @@
* protected with a spinlock.
*/
+struct bt_work {
+ struct list_head list;
+ struct rb_node node;
+ struct policy_work work;
+};
+
+extern struct kmem_cache *btracker_work_cache;
+
struct background_work;
struct background_tracker;
@@ -42,7 +50,6 @@ struct background_tracker *btracker_create(unsigned int max_work);
*/
void btracker_destroy(struct background_tracker *b);
-unsigned int btracker_nr_writebacks_queued(struct background_tracker *b);
unsigned int btracker_nr_demotions_queued(struct background_tracker *b);
/*
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 96751cd3d181..a9a1ab284076 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -170,7 +170,7 @@ struct dm_cache_metadata {
*/
#define SUPERBLOCK_CSUM_XOR 9031977
-static void sb_prepare_for_write(struct dm_block_validator *v,
+static void sb_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b,
size_t sb_block_size)
{
@@ -195,7 +195,7 @@ static int check_metadata_version(struct cache_disk_superblock *disk_super)
return 0;
}
-static int sb_check(struct dm_block_validator *v,
+static int sb_check(const struct dm_block_validator *v,
struct dm_block *b,
size_t sb_block_size)
{
@@ -228,7 +228,7 @@ static int sb_check(struct dm_block_validator *v,
return check_metadata_version(disk_super);
}
-static struct dm_block_validator sb_validator = {
+static const struct dm_block_validator sb_validator = {
.name = "superblock",
.prepare_for_write = sb_prepare_for_write,
.check = sb_check
@@ -1218,15 +1218,6 @@ int dm_cache_load_discards(struct dm_cache_metadata *cmd,
return r;
}
-int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result)
-{
- READ_LOCK(cmd);
- *result = cmd->cache_blocks;
- READ_UNLOCK(cmd);
-
- return 0;
-}
-
static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
{
int r;
@@ -1282,15 +1273,6 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
return r;
}
-struct thunk {
- load_mapping_fn fn;
- void *context;
-
- struct dm_cache_metadata *cmd;
- bool respect_dirty_flags;
- bool hints_valid;
-};
-
static bool policy_unchanged(struct dm_cache_metadata *cmd,
struct dm_cache_policy *policy)
{
@@ -1516,30 +1498,6 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
return r;
}
-static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
-{
- __le64 value;
- dm_oblock_t oblock;
- unsigned int flags;
-
- memcpy(&value, leaf, sizeof(value));
- unpack_value(value, &oblock, &flags);
-
- return 0;
-}
-
-static int __dump_mappings(struct dm_cache_metadata *cmd)
-{
- return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL);
-}
-
-void dm_cache_dump(struct dm_cache_metadata *cmd)
-{
- READ_LOCK_VOID(cmd);
- __dump_mappings(cmd);
- READ_UNLOCK(cmd);
-}
-
int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
{
int r;
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 57afc7047947..5f77890207fe 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -71,7 +71,6 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
* origin blocks to map to.
*/
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
-int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result);
int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
sector_t discard_block_size,
@@ -123,8 +122,6 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
dm_block_t *result);
-void dm_cache_dump(struct dm_cache_metadata *cmd);
-
/*
* The policy is invited to save a 32bit hint value for every cblock (eg,
* for a hit count). These are stored against the policy name. If
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 911f73f7ebba..9cb797a561d6 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -10,6 +10,7 @@
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include "dm-io-tracker.h"
+#include "dm-cache-background-tracker.h"
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
@@ -115,8 +116,7 @@ static void __commit(struct work_struct *_ws)
*/
spin_lock_irq(&b->lock);
list_splice_init(&b->work_items, &work_items);
- bio_list_merge(&bios, &b->bios);
- bio_list_init(&b->bios);
+ bio_list_merge_init(&bios, &b->bios);
b->commit_scheduled = false;
spin_unlock_irq(&b->lock);
@@ -565,8 +565,7 @@ static void defer_bio(struct cache *cache, struct bio *bio)
static void defer_bios(struct cache *cache, struct bio_list *bios)
{
spin_lock_irq(&cache->lock);
- bio_list_merge(&cache->deferred_bios, bios);
- bio_list_init(bios);
+ bio_list_merge_init(&cache->deferred_bios, bios);
spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
@@ -1370,7 +1369,7 @@ static void mg_copy(struct work_struct *ws)
*/
bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
- BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
+ BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
mg->overwrite_bio = NULL;
inc_io_migrations(mg->cache);
mg_full_copy(ws);
@@ -1816,8 +1815,7 @@ static void process_deferred_bios(struct work_struct *ws)
bio_list_init(&bios);
spin_lock_irq(&cache->lock);
- bio_list_merge(&bios, &cache->deferred_bios);
- bio_list_init(&cache->deferred_bios);
+ bio_list_merge_init(&bios, &cache->deferred_bios);
spin_unlock_irq(&cache->lock);
while ((bio = bio_list_pop(&bios))) {
@@ -1847,8 +1845,7 @@ static void requeue_deferred_bios(struct cache *cache)
struct bio_list bios;
bio_list_init(&bios);
- bio_list_merge(&bios, &cache->deferred_bios);
- bio_list_init(&cache->deferred_bios);
+ bio_list_merge_init(&bios, &cache->deferred_bios);
while ((bio = bio_list_pop(&bios))) {
bio->bi_status = BLK_STS_DM_REQUEUE;
@@ -1909,16 +1906,13 @@ static void check_migrations(struct work_struct *ws)
* This function gets called on the error paths of the constructor, so we
* have to cope with a partially initialised struct.
*/
-static void destroy(struct cache *cache)
+static void __destroy(struct cache *cache)
{
- unsigned int i;
-
mempool_exit(&cache->migration_pool);
if (cache->prison)
dm_bio_prison_destroy_v2(cache->prison);
- cancel_delayed_work_sync(&cache->waker);
if (cache->wq)
destroy_workqueue(cache->wq);
@@ -1946,13 +1940,22 @@ static void destroy(struct cache *cache)
if (cache->policy)
dm_cache_policy_destroy(cache->policy);
+ bioset_exit(&cache->bs);
+
+ kfree(cache);
+}
+
+static void destroy(struct cache *cache)
+{
+ unsigned int i;
+
+ cancel_delayed_work_sync(&cache->waker);
+
for (i = 0; i < cache->nr_ctr_args ; i++)
kfree(cache->ctr_args[i]);
kfree(cache->ctr_args);
- bioset_exit(&cache->bs);
-
- kfree(cache);
+ __destroy(cache);
}
static void cache_dtr(struct dm_target *ti)
@@ -2007,7 +2010,6 @@ struct cache_args {
sector_t cache_sectors;
struct dm_dev *origin_dev;
- sector_t origin_sectors;
uint32_t block_size;
@@ -2088,6 +2090,7 @@ static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
char **error)
{
+ sector_t origin_sectors;
int r;
if (!at_least_one_arg(as, error))
@@ -2100,8 +2103,8 @@ static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
return r;
}
- ca->origin_sectors = get_dev_size(ca->origin_dev);
- if (ca->ti->len > ca->origin_sectors) {
+ origin_sectors = get_dev_size(ca->origin_dev);
+ if (ca->ti->len > origin_sectors) {
*error = "Device size larger than cached device";
return -EINVAL;
}
@@ -2261,7 +2264,7 @@ static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
/*----------------------------------------------------------------*/
-static struct kmem_cache *migration_cache;
+static struct kmem_cache *migration_cache = NULL;
#define NOT_CORE_OPTION 1
@@ -2411,7 +2414,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
- origin_blocks = cache->origin_sectors = ca->origin_sectors;
+ origin_blocks = cache->origin_sectors = ti->len;
origin_blocks = block_div(origin_blocks, ca->block_size);
cache->origin_blocks = to_oblock(origin_blocks);
@@ -2565,7 +2568,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
*result = cache;
return 0;
bad:
- destroy(cache);
+ __destroy(cache);
return r;
}
@@ -2616,7 +2619,7 @@ static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
if (r) {
- destroy(cache);
+ __destroy(cache);
goto out;
}
@@ -2899,19 +2902,19 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache)
static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
- if (cache->sized) {
- DMERR("%s: unable to extend cache due to missing cache table reload",
- cache_device_name(cache));
- return false;
- }
+ DMERR("%s: unable to extend cache due to missing cache table reload",
+ cache_device_name(cache));
+ return false;
}
/*
* We can't drop a dirty block when shrinking the cache.
*/
- while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
- new_size = to_cblock(from_cblock(new_size) + 1);
- if (is_dirty(cache, new_size)) {
+ if (cache->loaded_mappings) {
+ new_size = to_cblock(find_next_bit(cache->dirty_bitset,
+ from_cblock(cache->cache_size),
+ from_cblock(new_size)));
+ if (new_size != cache->cache_size) {
DMERR("%s: unable to shrink cache; cache block %llu is dirty",
cache_device_name(cache),
(unsigned long long) from_cblock(new_size));
@@ -2947,20 +2950,15 @@ static int cache_preresume(struct dm_target *ti)
/*
* Check to see if the cache has resized.
*/
- if (!cache->sized) {
- r = resize_cache_dev(cache, csize);
- if (r)
- return r;
-
- cache->sized = true;
-
- } else if (csize != cache->cache_size) {
+ if (!cache->sized || csize != cache->cache_size) {
if (!can_resize(cache, csize))
return -EINVAL;
r = resize_cache_dev(cache, csize);
if (r)
return r;
+
+ cache->sized = true;
}
if (!cache->loaded_mappings) {
@@ -3204,8 +3202,6 @@ static int parse_cblock_range(struct cache *cache, const char *str,
* Try and parse form (ii) first.
*/
r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
- if (r < 0)
- return r;
if (r == 2) {
result->begin = to_cblock(b);
@@ -3217,8 +3213,6 @@ static int parse_cblock_range(struct cache *cache, const char *str,
* That didn't work, try form (i).
*/
r = sscanf(str, "%llu%c", &b, &dummy);
- if (r < 0)
- return r;
if (r == 1) {
result->begin = to_cblock(b);
@@ -3368,7 +3362,7 @@ static int cache_iterate_devices(struct dm_target *ti,
static void disable_passdown_if_not_supported(struct cache *cache)
{
struct block_device *origin_bdev = cache->origin_dev->bdev;
- struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+ struct queue_limits *origin_limits = bdev_limits(origin_bdev);
const char *reason = NULL;
if (!cache->features.discard_passdown)
@@ -3390,12 +3384,12 @@ static void disable_passdown_if_not_supported(struct cache *cache)
static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
struct block_device *origin_bdev = cache->origin_dev->bdev;
- struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+ struct queue_limits *origin_limits = bdev_limits(origin_bdev);
if (!cache->features.discard_passdown) {
/* No passdown is done so setting own virtual limits */
- limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
- cache->origin_sectors);
+ limits->max_hw_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
+ cache->origin_sectors);
limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
return;
}
@@ -3404,11 +3398,9 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
* cache_iterate_devices() is stacking both origin and fast device limits
* but discards aren't passed to fast device, so inherit origin's limits.
*/
- limits->max_discard_sectors = origin_limits->max_discard_sectors;
limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
limits->discard_granularity = origin_limits->discard_granularity;
limits->discard_alignment = origin_limits->discard_alignment;
- limits->discard_misaligned = origin_limits->discard_misaligned;
}
static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -3422,8 +3414,8 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
if (io_opt_sectors < cache->sectors_per_block ||
do_div(io_opt_sectors, cache->sectors_per_block)) {
- blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
- blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+ limits->io_min = cache->sectors_per_block << SECTOR_SHIFT;
+ limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT;
}
disable_passdown_if_not_supported(cache);
@@ -3454,22 +3446,36 @@ static int __init dm_cache_init(void)
int r;
migration_cache = KMEM_CACHE(dm_cache_migration, 0);
- if (!migration_cache)
- return -ENOMEM;
+ if (!migration_cache) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ btracker_work_cache = kmem_cache_create("dm_cache_bt_work",
+ sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL);
+ if (!btracker_work_cache) {
+ r = -ENOMEM;
+ goto err;
+ }
r = dm_register_target(&cache_target);
if (r) {
- kmem_cache_destroy(migration_cache);
- return r;
+ goto err;
}
return 0;
+
+err:
+ kmem_cache_destroy(migration_cache);
+ kmem_cache_destroy(btracker_work_cache);
+ return r;
}
static void __exit dm_cache_exit(void)
{
dm_unregister_target(&cache_target);
kmem_cache_destroy(migration_cache);
+ kmem_cache_destroy(btracker_work_cache);
}
module_init(dm_cache_init);
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index c43d55672bce..14c5c28d938b 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -163,7 +163,7 @@ struct dm_clone_metadata {
/*
* Superblock validation.
*/
-static void sb_prepare_for_write(struct dm_block_validator *v,
+static void sb_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b, size_t sb_block_size)
{
struct superblock_disk *sb;
@@ -177,7 +177,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
sb->csum = cpu_to_le32(csum);
}
-static int sb_check(struct dm_block_validator *v, struct dm_block *b,
+static int sb_check(const struct dm_block_validator *v, struct dm_block *b,
size_t sb_block_size)
{
struct superblock_disk *sb;
@@ -220,7 +220,7 @@ static int sb_check(struct dm_block_validator *v, struct dm_block *b,
return 0;
}
-static struct dm_block_validator sb_validator = {
+static const struct dm_block_validator sb_validator = {
.name = "superblock",
.prepare_for_write = sb_prepare_for_write,
.check = sb_check
@@ -465,11 +465,6 @@ static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd)
/*---------------------------------------------------------------------------*/
-static size_t bitmap_size(unsigned long nr_bits)
-{
- return BITS_TO_LONGS(nr_bits) * sizeof(long);
-}
-
static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
unsigned long nr_regions)
{
@@ -535,10 +530,7 @@ static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
return r;
for (i = 0; ; i++) {
- if (dm_bitset_cursor_get_value(&c))
- __set_bit(i, cmd->region_map);
- else
- __clear_bit(i, cmd->region_map);
+ __assign_bit(i, cmd->region_map, dm_bitset_cursor_get_value(&c));
if (i >= (cmd->nr_regions - 1))
break;
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 94b2fc33f64b..e956d980672c 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1181,8 +1181,7 @@ static void process_deferred_discards(struct clone *clone)
struct bio_list discards = BIO_EMPTY_LIST;
spin_lock_irq(&clone->lock);
- bio_list_merge(&discards, &clone->deferred_discard_bios);
- bio_list_init(&clone->deferred_discard_bios);
+ bio_list_merge_init(&discards, &clone->deferred_discard_bios);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&discards))
@@ -1215,8 +1214,7 @@ static void process_deferred_bios(struct clone *clone)
struct bio_list bios = BIO_EMPTY_LIST;
spin_lock_irq(&clone->lock);
- bio_list_merge(&bios, &clone->deferred_bios);
- bio_list_init(&clone->deferred_bios);
+ bio_list_merge_init(&bios, &clone->deferred_bios);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios))
@@ -1237,11 +1235,9 @@ static void process_deferred_flush_bios(struct clone *clone)
* before issuing them or signaling their completion.
*/
spin_lock_irq(&clone->lock);
- bio_list_merge(&bios, &clone->deferred_flush_bios);
- bio_list_init(&clone->deferred_flush_bios);
-
- bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
- bio_list_init(&clone->deferred_flush_completions);
+ bio_list_merge_init(&bios, &clone->deferred_flush_bios);
+ bio_list_merge_init(&bio_completions,
+ &clone->deferred_flush_completions);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
@@ -2024,7 +2020,7 @@ static void clone_resume(struct dm_target *ti)
static void disable_passdown_if_not_supported(struct clone *clone)
{
struct block_device *dest_dev = clone->dest_dev->bdev;
- struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
+ struct queue_limits *dest_limits = bdev_limits(dest_dev);
const char *reason = NULL;
if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
@@ -2045,12 +2041,13 @@ static void disable_passdown_if_not_supported(struct clone *clone)
static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
{
struct block_device *dest_bdev = clone->dest_dev->bdev;
- struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
+ struct queue_limits *dest_limits = bdev_limits(dest_bdev);
if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
/* No passdown is done so we set our own virtual limits */
limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
- limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
+ limits->max_hw_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT,
+ clone->region_size);
return;
}
@@ -2059,11 +2056,9 @@ static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
* device limits but discards aren't passed to the source device, so
* inherit destination's limits.
*/
- limits->max_discard_sectors = dest_limits->max_discard_sectors;
limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
limits->discard_granularity = dest_limits->discard_granularity;
limits->discard_alignment = dest_limits->discard_alignment;
- limits->discard_misaligned = dest_limits->discard_misaligned;
limits->max_discard_segments = dest_limits->max_discard_segments;
}
@@ -2078,8 +2073,8 @@ static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
if (io_opt_sectors < clone->region_size ||
do_div(io_opt_sectors, clone->region_size)) {
- blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
- blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
+ limits->io_min = clone->region_size << SECTOR_SHIFT;
+ limits->io_opt = clone->region_size << SECTOR_SHIFT;
}
disable_passdown_if_not_supported(clone);
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..3637761f3585 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -140,7 +140,7 @@ struct mapped_device {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
- unsigned int *zwp_offset;
+ void *zone_revalidate_map;
#endif
#ifdef CONFIG_IMA
@@ -206,7 +206,8 @@ struct dm_table {
bool integrity_supported:1;
bool singleton:1;
- unsigned integrity_added:1;
+ /* set if all the targets in the table have "flush_bypasses_map" set */
+ bool flush_bypasses_map:1;
/*
* Indicates the rw permissions for the new logical device. This
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9a74c6316c5d..02a2919f4e5a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -28,7 +28,7 @@
#include <linux/rbtree.h>
#include <linux/ctype.h>
#include <asm/page.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <crypto/hash.h>
#include <crypto/md5.h>
#include <crypto/skcipher.h>
@@ -47,6 +47,8 @@
#define DM_MSG_PREFIX "crypt"
+static DEFINE_IDA(workqueue_ida);
+
/*
* context holding the current state of a multi-part conversion
*/
@@ -57,6 +59,7 @@ struct convert_context {
struct bio *bio_out;
struct bvec_iter iter_out;
atomic_t cc_pending;
+ unsigned int tag_offset;
u64 cc_sector;
union {
struct skcipher_request *req;
@@ -137,14 +140,15 @@ struct iv_elephant_private {
* and encrypts / decrypts at the same time.
*/
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
- DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
- DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE,
- DM_CRYPT_WRITE_INLINE };
+ DM_CRYPT_SAME_CPU, DM_CRYPT_HIGH_PRIORITY,
+ DM_CRYPT_NO_OFFLOAD, DM_CRYPT_NO_READ_WORKQUEUE,
+ DM_CRYPT_NO_WRITE_WORKQUEUE, DM_CRYPT_WRITE_INLINE };
enum cipher_flags {
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */
CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */
+ CRYPT_KEY_MAC_SIZE_SET, /* The integrity_key_size option was used */
};
/*
@@ -184,6 +188,7 @@ struct crypt_config {
struct crypto_aead **tfms_aead;
} cipher_tfm;
unsigned int tfms_count;
+ int workqueue_id;
unsigned long cipher_flags;
/*
@@ -211,7 +216,8 @@ struct crypt_config {
unsigned int integrity_tag_size;
unsigned int integrity_iv_size;
- unsigned int on_disk_tag_size;
+ unsigned int used_tag_size;
+ unsigned int tuple_size;
/*
* pool for per bio private data, crypto requests,
@@ -238,6 +244,31 @@ static unsigned int dm_crypt_clients_n;
static volatile unsigned long dm_crypt_pages_per_client;
#define DM_CRYPT_MEMORY_PERCENT 2
#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16)
+#define DM_CRYPT_DEFAULT_MAX_READ_SIZE 131072
+#define DM_CRYPT_DEFAULT_MAX_WRITE_SIZE 131072
+
+static unsigned int max_read_size = 0;
+module_param(max_read_size, uint, 0644);
+MODULE_PARM_DESC(max_read_size, "Maximum size of a read request");
+static unsigned int max_write_size = 0;
+module_param(max_write_size, uint, 0644);
+MODULE_PARM_DESC(max_write_size, "Maximum size of a write request");
+static unsigned get_max_request_size(struct crypt_config *cc, bool wrt)
+{
+ unsigned val, sector_align;
+ val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size);
+ if (likely(!val))
+ val = !wrt ? DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE;
+ if (wrt || cc->used_tag_size) {
+ if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT))
+ val = BIO_MAX_VECS << PAGE_SHIFT;
+ }
+ sector_align = max(bdev_logical_block_size(cc->dev->bdev), (unsigned)cc->sector_size);
+ val = round_down(val, sector_align);
+ if (unlikely(!val))
+ val = sector_align;
+ return val >> SECTOR_SHIFT;
+}
static void crypt_endio(struct bio *clone);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
@@ -1148,16 +1179,16 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
unsigned int tag_len;
int ret;
- if (!bio_sectors(bio) || !io->cc->on_disk_tag_size)
+ if (!bio_sectors(bio) || !io->cc->tuple_size)
return 0;
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
if (IS_ERR(bip))
return PTR_ERR(bip);
- tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
+ tag_len = io->cc->tuple_size * (bio_sectors(bio) >> io->cc->sector_shift);
- bip->bip_iter.bi_sector = io->cc->start + io->sector;
+ bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
tag_len, offset_in_page(io->integrity_metadata));
@@ -1173,24 +1204,24 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
struct mapped_device *md = dm_table_get_md(ti->table);
- /* From now we require underlying device with our integrity profile */
- if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
+ /* We require an underlying device with non-PI metadata */
+ if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) {
ti->error = "Integrity profile not supported.";
return -EINVAL;
}
- if (bi->tag_size != cc->on_disk_tag_size ||
- bi->tuple_size != cc->on_disk_tag_size) {
+ if (bi->tuple_size < cc->used_tag_size) {
ti->error = "Integrity profile tag size mismatch.";
return -EINVAL;
}
+ cc->tuple_size = bi->tuple_size;
if (1 << bi->interval_exp != cc->sector_size) {
ti->error = "Integrity profile sector size mismatch.";
return -EINVAL;
}
if (crypt_integrity_aead(cc)) {
- cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
+ cc->integrity_tag_size = cc->used_tag_size - cc->integrity_iv_size;
DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md),
cc->integrity_tag_size, cc->integrity_iv_size);
@@ -1202,7 +1233,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md),
cc->integrity_iv_size);
- if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
+ if ((cc->integrity_tag_size + cc->integrity_iv_size) > cc->tuple_size) {
ti->error = "Not enough space for integrity tag in the profile.";
return -EINVAL;
}
@@ -1226,6 +1257,7 @@ static void crypt_convert_init(struct crypt_config *cc,
if (bio_out)
ctx->iter_out = bio_out->bi_iter;
ctx->cc_sector = sector + cc->iv_offset;
+ ctx->tag_offset = 0;
init_completion(&ctx->restart);
}
@@ -1281,7 +1313,7 @@ static void *tag_from_dmreq(struct crypt_config *cc,
struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) *
- cc->on_disk_tag_size];
+ cc->tuple_size];
}
static void *iv_tag_from_dmreq(struct crypt_config *cc,
@@ -1362,9 +1394,9 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
cc->sector_size, iv);
r = crypto_aead_encrypt(req);
- if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size)
+ if (cc->integrity_tag_size + cc->integrity_iv_size != cc->tuple_size)
memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0,
- cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size));
+ cc->tuple_size - (cc->integrity_tag_size + cc->integrity_iv_size));
} else {
aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
cc->sector_size + cc->integrity_tag_size, iv);
@@ -1558,7 +1590,6 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
static blk_status_t crypt_convert(struct crypt_config *cc,
struct convert_context *ctx, bool atomic, bool reset_pending)
{
- unsigned int tag_offset = 0;
unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
int r;
@@ -1581,9 +1612,9 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->cc_pending);
if (crypt_integrity_aead(cc))
- r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset);
+ r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, ctx->tag_offset);
else
- r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset);
+ r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, ctx->tag_offset);
switch (r) {
/*
@@ -1603,8 +1634,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
* exit and continue processing in a workqueue
*/
ctx->r.req = NULL;
+ ctx->tag_offset++;
ctx->cc_sector += sector_step;
- tag_offset++;
return BLK_STS_DEV_RESOURCE;
}
} else {
@@ -1618,8 +1649,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
*/
case -EINPROGRESS:
ctx->r.req = NULL;
+ ctx->tag_offset++;
ctx->cc_sector += sector_step;
- tag_offset++;
continue;
/*
* The request was already processed (synchronously).
@@ -1627,7 +1658,7 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
case 0:
atomic_dec(&ctx->cc_pending);
ctx->cc_sector += sector_step;
- tag_offset++;
+ ctx->tag_offset++;
if (!atomic)
cond_resched();
continue;
@@ -1653,8 +1684,8 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
/*
* Generate a new unfragmented bio with the given size
- * This should never violate the device limitations (but only because
- * max_segment_size is being constrained to PAGE_SIZE).
+ * This should never violate the device limitations (but if it did then block
+ * core should split the bio as needed).
*
* This function may be called concurrently. If we allocate from the mempool
* concurrently, there is a possibility of deadlock. For example, if we have
@@ -1689,6 +1720,7 @@ retry:
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
clone->bi_ioprio = io->base_bio->bi_ioprio;
+ clone->bi_iter.bi_sector = cc->start + io->sector;
remaining_size = size;
@@ -1794,7 +1826,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
return;
if (likely(!io->ctx.aead_recheck) && unlikely(io->ctx.aead_failed) &&
- cc->on_disk_tag_size && bio_data_dir(base_bio) == READ) {
+ cc->used_tag_size && bio_data_dir(base_bio) == READ) {
io->ctx.aead_recheck = true;
io->ctx.aead_failed = false;
io->error = 0;
@@ -1879,7 +1911,6 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
crypt_dec_pending(io);
return 1;
}
- clone->bi_iter.bi_sector = cc->start + io->sector;
crypt_convert_init(cc, &io->ctx, clone, clone, io->sector);
io->saved_bi_iter = clone->bi_iter;
dm_submit_bio_remap(io->base_bio, clone);
@@ -1895,13 +1926,13 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
clone = bio_alloc_clone(cc->dev->bdev, io->base_bio, gfp, &cc->bs);
if (!clone)
return 1;
+
+ clone->bi_iter.bi_sector = cc->start + io->sector;
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
crypt_inc_pending(io);
- clone->bi_iter.bi_sector = cc->start + io->sector;
-
if (dm_crypt_integrity_io_alloc(io, clone)) {
crypt_dec_pending(io);
bio_put(clone);
@@ -2009,8 +2040,6 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
/* crypt_convert should have filled the clone bio */
BUG_ON(io->ctx.iter_out.bi_size);
- clone->bi_iter.bi_sector = cc->start + io->sector;
-
if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) ||
test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) {
dm_submit_bio_remap(io->base_bio, clone);
@@ -2062,13 +2091,12 @@ static void kcryptd_crypt_write_continue(struct work_struct *work)
struct crypt_config *cc = io->cc;
struct convert_context *ctx = &io->ctx;
int crypt_finished;
- sector_t sector = io->sector;
blk_status_t r;
wait_for_completion(&ctx->restart);
reinit_completion(&ctx->restart);
- r = crypt_convert(cc, &io->ctx, true, false);
+ r = crypt_convert(cc, &io->ctx, false, false);
if (r)
io->error = r;
crypt_finished = atomic_dec_and_test(&ctx->cc_pending);
@@ -2079,10 +2107,8 @@ static void kcryptd_crypt_write_continue(struct work_struct *work)
}
/* Encryption was already finished, submit io now */
- if (crypt_finished) {
+ if (crypt_finished)
kcryptd_crypt_write_io_submit(io, 0);
- io->sector = sector;
- }
crypt_dec_pending(io);
}
@@ -2093,14 +2119,13 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
struct convert_context *ctx = &io->ctx;
struct bio *clone;
int crypt_finished;
- sector_t sector = io->sector;
blk_status_t r;
/*
* Prevent io from disappearing until this function completes.
*/
crypt_inc_pending(io);
- crypt_convert_init(cc, ctx, NULL, io->base_bio, sector);
+ crypt_convert_init(cc, ctx, NULL, io->base_bio, io->sector);
clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
if (unlikely(!clone)) {
@@ -2117,8 +2142,6 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
io->ctx.iter_in = clone->bi_iter;
}
- sector += bio_sectors(clone);
-
crypt_inc_pending(io);
r = crypt_convert(cc, ctx,
test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags), true);
@@ -2142,10 +2165,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
}
/* Encryption was already finished, submit io now */
- if (crypt_finished) {
+ if (crypt_finished)
kcryptd_crypt_write_io_submit(io, 0);
- io->sector = sector;
- }
dec:
crypt_dec_pending(io);
@@ -2173,7 +2194,7 @@ static void kcryptd_crypt_read_continue(struct work_struct *work)
wait_for_completion(&io->ctx.restart);
reinit_completion(&io->ctx.restart);
- r = crypt_convert(cc, &io->ctx, true, false);
+ r = crypt_convert(cc, &io->ctx, false, false);
if (r)
io->error = r;
@@ -2191,7 +2212,6 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
crypt_inc_pending(io);
if (io->ctx.aead_recheck) {
- io->ctx.cc_sector = io->sector + cc->iv_offset;
r = crypt_convert(cc, &io->ctx,
test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true);
} else {
@@ -2584,35 +2604,31 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
key = request_key(type, key_desc + 1, NULL);
if (IS_ERR(key)) {
- kfree_sensitive(new_key_string);
- return PTR_ERR(key);
+ ret = PTR_ERR(key);
+ goto free_new_key_string;
}
down_read(&key->sem);
-
ret = set_key(cc, key);
- if (ret < 0) {
- up_read(&key->sem);
- key_put(key);
- kfree_sensitive(new_key_string);
- return ret;
- }
-
up_read(&key->sem);
key_put(key);
+ if (ret < 0)
+ goto free_new_key_string;
/* clear the flag since following operations may invalidate previously valid key */
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
ret = crypt_setkey(cc);
+ if (ret)
+ goto free_new_key_string;
- if (!ret) {
- set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- kfree_sensitive(cc->key_string);
- cc->key_string = new_key_string;
- } else
- kfree_sensitive(new_key_string);
+ set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
+ kfree_sensitive(cc->key_string);
+ cc->key_string = new_key_string;
+ return 0;
+free_new_key_string:
+ kfree_sensitive(new_key_string);
return ret;
}
@@ -2771,6 +2787,9 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->crypt_queue)
destroy_workqueue(cc->crypt_queue);
+ if (cc->workqueue_id)
+ ida_free(&workqueue_ida, cc->workqueue_id);
+
crypt_free_tfms(cc);
bioset_exit(&cc->bs);
@@ -2905,7 +2924,8 @@ static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api)
if (IS_ERR(mac))
return PTR_ERR(mac);
- cc->key_mac_size = crypto_ahash_digestsize(mac);
+ if (!test_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags))
+ cc->key_mac_size = crypto_ahash_digestsize(mac);
crypto_free_ahash(mac);
cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL);
@@ -3134,7 +3154,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
struct crypt_config *cc = ti->private;
struct dm_arg_set as;
static const struct dm_arg _args[] = {
- {0, 8, "Invalid number of feature args"},
+ {0, 9, "Invalid number of feature args"},
};
unsigned int opt_params, val;
const char *opt_string, *sval;
@@ -3161,6 +3181,8 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
else if (!strcasecmp(opt_string, "same_cpu_crypt"))
set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+ else if (!strcasecmp(opt_string, "high_priority"))
+ set_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags);
else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
@@ -3173,7 +3195,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
ti->error = "Invalid integrity arguments";
return -EINVAL;
}
- cc->on_disk_tag_size = val;
+ cc->used_tag_size = val;
sval = strchr(opt_string + strlen("integrity:"), ':') + 1;
if (!strcasecmp(sval, "aead")) {
set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
@@ -3185,6 +3207,13 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
cc->cipher_auth = kstrdup(sval, GFP_KERNEL);
if (!cc->cipher_auth)
return -ENOMEM;
+ } else if (sscanf(opt_string, "integrity_key_size:%u%c", &val, &dummy) == 1) {
+ if (!val) {
+ ti->error = "Invalid integrity_key_size argument";
+ return -EINVAL;
+ }
+ cc->key_mac_size = val;
+ set_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags);
} else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) {
if (cc->sector_size < (1 << SECTOR_SHIFT) ||
cc->sector_size > 4096 ||
@@ -3230,8 +3259,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct crypt_config *cc;
const char *devname = dm_table_device_name(ti->table);
- int key_size;
+ int key_size, wq_id;
unsigned int align_mask;
+ unsigned int common_wq_flags;
unsigned long long tmpll;
int ret;
size_t iv_size_padding, additional_req_size;
@@ -3384,12 +3414,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
- cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size;
+ cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->tuple_size;
if (!cc->tag_pool_max_sectors)
cc->tag_pool_max_sectors = 1;
ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS,
- cc->tag_pool_max_sectors * cc->on_disk_tag_size);
+ cc->tag_pool_max_sectors * cc->tuple_size);
if (ret) {
ti->error = "Cannot allocate integrity tags mempool";
goto bad;
@@ -3398,20 +3428,38 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->tag_pool_max_sectors <<= cc->sector_shift;
}
+ wq_id = ida_alloc_min(&workqueue_ida, 1, GFP_KERNEL);
+ if (wq_id < 0) {
+ ti->error = "Couldn't get workqueue id";
+ ret = wq_id;
+ goto bad;
+ }
+ cc->workqueue_id = wq_id;
+
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
+ common_wq_flags = WQ_MEM_RECLAIM | WQ_SYSFS;
+ if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags))
+ common_wq_flags |= WQ_HIGHPRI;
+
+ cc->io_queue = alloc_workqueue("kcryptd_io-%s-%d", common_wq_flags, 1, devname, wq_id);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
- if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
- 1, devname);
- else
- cc->crypt_queue = alloc_workqueue("kcryptd/%s",
- WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
- num_online_cpus(), devname);
+ if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) {
+ cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d",
+ common_wq_flags | WQ_CPU_INTENSIVE,
+ 1, devname, wq_id);
+ } else {
+ /*
+ * While crypt_queue is certainly CPU intensive, the use of
+ * WQ_CPU_INTENSIVE is meaningless with WQ_UNBOUND.
+ */
+ cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d",
+ common_wq_flags | WQ_UNBOUND,
+ num_online_cpus(), devname, wq_id);
+ }
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
@@ -3427,6 +3475,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Couldn't spawn write thread";
goto bad;
}
+ if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags))
+ set_user_nice(cc->write_thread, MIN_NICE);
ti->num_flush_bios = 1;
ti->limit_swap_bios = true;
@@ -3445,6 +3495,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
{
struct dm_crypt_io *io;
struct crypt_config *cc = ti->private;
+ unsigned max_sectors;
/*
* If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues.
@@ -3463,9 +3514,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
/*
* Check if bio is too large, split as needed.
*/
- if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) &&
- (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size))
- dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT));
+ max_sectors = get_max_request_size(cc, bio_data_dir(bio) == WRITE);
+ if (unlikely(bio_sectors(bio) > max_sectors))
+ dm_accept_partial_bio(bio, max_sectors);
/*
* Ensure that bio is a multiple of internal sector encryption size
@@ -3480,8 +3531,8 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
io = dm_per_bio_data(bio, cc->per_bio_data_size);
crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
- if (cc->on_disk_tag_size) {
- unsigned int tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
+ if (cc->tuple_size) {
+ unsigned int tag_len = cc->tuple_size * (bio_sectors(bio) >> cc->sector_shift);
if (unlikely(tag_len > KMALLOC_MAX_SIZE))
io->integrity_metadata = NULL;
@@ -3547,31 +3598,36 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
num_feature_args += !!ti->num_discard_bios;
num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+ num_feature_args += !!cc->used_tag_size;
num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
- if (cc->on_disk_tag_size)
- num_feature_args++;
+ num_feature_args += test_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags);
if (num_feature_args) {
DMEMIT(" %d", num_feature_args);
if (ti->num_discard_bios)
DMEMIT(" allow_discards");
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
DMEMIT(" same_cpu_crypt");
+ if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags))
+ DMEMIT(" high_priority");
if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
DMEMIT(" submit_from_crypt_cpus");
if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags))
DMEMIT(" no_read_workqueue");
if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))
DMEMIT(" no_write_workqueue");
- if (cc->on_disk_tag_size)
- DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth);
+ if (cc->used_tag_size)
+ DMEMIT(" integrity:%u:%s", cc->used_tag_size, cc->cipher_auth);
if (cc->sector_size != (1 << SECTOR_SHIFT))
DMEMIT(" sector_size:%d", cc->sector_size);
if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
DMEMIT(" iv_large_sectors");
+ if (test_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags))
+ DMEMIT(" integrity_key_size:%u", cc->key_mac_size);
}
break;
@@ -3579,6 +3635,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT_TARGET_NAME_VERSION(ti->type);
DMEMIT(",allow_discards=%c", ti->num_discard_bios ? 'y' : 'n');
DMEMIT(",same_cpu_crypt=%c", test_bit(DM_CRYPT_SAME_CPU, &cc->flags) ? 'y' : 'n');
+ DMEMIT(",high_priority=%c", test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags) ? 'y' : 'n');
DMEMIT(",submit_from_crypt_cpus=%c", test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags) ?
'y' : 'n');
DMEMIT(",no_read_workqueue=%c", test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags) ?
@@ -3588,9 +3645,9 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(",iv_large_sectors=%c", test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags) ?
'y' : 'n');
- if (cc->on_disk_tag_size)
+ if (cc->used_tag_size)
DMEMIT(",integrity_tag_size=%u,cipher_auth=%s",
- cc->on_disk_tag_size, cc->cipher_auth);
+ cc->used_tag_size, cc->cipher_auth);
if (cc->sector_size != (1 << SECTOR_SHIFT))
DMEMIT(",sector_size=%d", cc->sector_size);
if (cc->cipher_string)
@@ -3688,14 +3745,6 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct crypt_config *cc = ti->private;
- /*
- * Unfortunate constraint that is required to avoid the potential
- * for exceeding underlying device's max_segments limits -- due to
- * crypt_alloc_buffer() possibly allocating pages for the encryption
- * bio that are not as physically contiguous as the original bio.
- */
- limits->max_segment_size = PAGE_SIZE;
-
limits->logical_block_size =
max_t(unsigned int, limits->logical_block_size, cc->sector_size);
limits->physical_block_size =
@@ -3706,7 +3755,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 25, 0},
+ .version = {1, 28, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 5eabdb06c649..08f6387620c1 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -28,7 +28,8 @@ struct delay_class {
struct delay_c {
struct timer_list delay_timer;
- struct mutex timer_lock;
+ struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
+ spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
struct workqueue_struct *kdelayd_wq;
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
@@ -49,8 +50,6 @@ struct dm_delay_info {
unsigned long expires;
};
-static DEFINE_MUTEX(delayed_bios_lock);
-
static void handle_delayed_timer(struct timer_list *t)
{
struct delay_c *dc = from_timer(dc, t, delay_timer);
@@ -60,12 +59,7 @@ static void handle_delayed_timer(struct timer_list *t)
static void queue_timeout(struct delay_c *dc, unsigned long expires)
{
- mutex_lock(&dc->timer_lock);
-
- if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
- mod_timer(&dc->delay_timer, expires);
-
- mutex_unlock(&dc->timer_lock);
+ timer_reduce(&dc->delay_timer, expires);
}
static inline bool delay_is_fast(struct delay_c *dc)
@@ -89,12 +83,16 @@ static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
{
struct dm_delay_info *delayed, *next;
struct bio_list flush_bio_list;
+ LIST_HEAD(local_list);
unsigned long next_expires = 0;
bool start_timer = false;
bio_list_init(&flush_bio_list);
- mutex_lock(&delayed_bios_lock);
- list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+ mutex_lock(&dc->process_bios_lock);
+ spin_lock(&dc->delayed_bios_lock);
+ list_replace_init(&dc->delayed_bios, &local_list);
+ spin_unlock(&dc->delayed_bios_lock);
+ list_for_each_entry_safe(delayed, next, &local_list, list) {
cond_resched();
if (flush_all || time_after_eq(jiffies, delayed->expires)) {
struct bio *bio = dm_bio_from_per_bio_data(delayed,
@@ -114,7 +112,10 @@ static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
}
}
}
- mutex_unlock(&delayed_bios_lock);
+ spin_lock(&dc->delayed_bios_lock);
+ list_splice(&local_list, &dc->delayed_bios);
+ spin_unlock(&dc->delayed_bios_lock);
+ mutex_unlock(&dc->process_bios_lock);
if (start_timer)
queue_timeout(dc, next_expires);
@@ -128,13 +129,13 @@ static int flush_worker_fn(void *data)
while (!kthread_should_stop()) {
flush_delayed_bios(dc, false);
- mutex_lock(&delayed_bios_lock);
+ spin_lock(&dc->delayed_bios_lock);
if (unlikely(list_empty(&dc->delayed_bios))) {
set_current_state(TASK_INTERRUPTIBLE);
- mutex_unlock(&delayed_bios_lock);
+ spin_unlock(&dc->delayed_bios_lock);
schedule();
} else {
- mutex_unlock(&delayed_bios_lock);
+ spin_unlock(&dc->delayed_bios_lock);
cond_resched();
}
}
@@ -154,8 +155,10 @@ static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
- if (dc->kdelayd_wq)
+ if (dc->kdelayd_wq) {
+ timer_shutdown_sync(&dc->delay_timer);
destroy_workqueue(dc->kdelayd_wq);
+ }
if (dc->read.dev)
dm_put_device(ti, dc->read.dev);
@@ -166,7 +169,7 @@ static void delay_dtr(struct dm_target *ti)
if (dc->worker)
kthread_stop(dc->worker);
- mutex_destroy(&dc->timer_lock);
+ mutex_destroy(&dc->process_bios_lock);
kfree(dc);
}
@@ -224,7 +227,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = dc;
INIT_LIST_HEAD(&dc->delayed_bios);
- mutex_init(&dc->timer_lock);
+ mutex_init(&dc->process_bios_lock);
+ spin_lock_init(&dc->delayed_bios_lock);
dc->may_delay = true;
dc->argc = argc;
@@ -240,19 +244,18 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->flush, argv);
if (ret)
goto bad;
- max_delay = max(max_delay, dc->write.delay);
- max_delay = max(max_delay, dc->flush.delay);
goto out;
}
ret = delay_class_ctr(ti, &dc->write, argv + 3);
if (ret)
goto bad;
+ max_delay = max(max_delay, dc->write.delay);
+
if (argc == 6) {
ret = delay_class_ctr(ti, &dc->flush, argv + 3);
if (ret)
goto bad;
- max_delay = max(max_delay, dc->flush.delay);
goto out;
}
@@ -267,8 +270,7 @@ out:
* In case of small requested delays, use kthread instead of
* timers and workqueue to achieve better latency.
*/
- dc->worker = kthread_create(&flush_worker_fn, dc,
- "dm-delay-flush-worker");
+ dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker");
if (IS_ERR(dc->worker)) {
ret = PTR_ERR(dc->worker);
dc->worker = NULL;
@@ -309,14 +311,14 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
delayed->context = dc;
delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
- mutex_lock(&delayed_bios_lock);
+ spin_lock(&dc->delayed_bios_lock);
if (unlikely(!dc->may_delay)) {
- mutex_unlock(&delayed_bios_lock);
+ spin_unlock(&dc->delayed_bios_lock);
return DM_MAPIO_REMAPPED;
}
c->ops++;
list_add_tail(&delayed->list, &dc->delayed_bios);
- mutex_unlock(&delayed_bios_lock);
+ spin_unlock(&dc->delayed_bios_lock);
if (delay_is_fast(dc))
wake_up_process(dc->worker);
@@ -330,12 +332,12 @@ static void delay_presuspend(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
- mutex_lock(&delayed_bios_lock);
+ spin_lock(&dc->delayed_bios_lock);
dc->may_delay = false;
- mutex_unlock(&delayed_bios_lock);
+ spin_unlock(&dc->delayed_bios_lock);
if (!delay_is_fast(dc))
- del_timer_sync(&dc->delay_timer);
+ timer_delete(&dc->delay_timer);
flush_delayed_bios(dc, true);
}
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index b70d4016c2ac..18ae45dcbfb2 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -428,7 +428,7 @@ static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
limits->logical_block_size = to_bytes(ec->e_bs);
limits->physical_block_size = to_bytes(ec->u_bs);
limits->alignment_offset = limits->physical_block_size;
- blk_limits_io_min(limits, limits->logical_block_size);
+ limits->io_min = limits->logical_block_size;
}
static int ebs_iterate_devices(struct dm_target *ti,
@@ -442,7 +442,7 @@ static int ebs_iterate_devices(struct dm_target *ti,
static struct target_type ebs_target = {
.name = "ebs",
.version = {1, 0, 1},
- .features = DM_TARGET_PASSES_INTEGRITY,
+ .features = 0,
.module = THIS_MODULE,
.ctr = ebs_ctr,
.dtr = ebs_dtr,
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 6acfa5bf97a4..9c84e9d13eca 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -196,7 +196,7 @@ struct superblock_disk {
* Superblock validation
*--------------------------------------------------------------
*/
-static void sb_prepare_for_write(struct dm_block_validator *v,
+static void sb_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b,
size_t sb_block_size)
{
@@ -221,7 +221,7 @@ static int check_metadata_version(struct superblock_disk *disk)
return 0;
}
-static int sb_check(struct dm_block_validator *v,
+static int sb_check(const struct dm_block_validator *v,
struct dm_block *b,
size_t sb_block_size)
{
@@ -254,7 +254,7 @@ static int sb_check(struct dm_block_validator *v,
return check_metadata_version(disk);
}
-static struct dm_block_validator sb_validator = {
+static const struct dm_block_validator sb_validator = {
.name = "superblock",
.prepare_for_write = sb_prepare_for_write,
.check = sb_check
@@ -1272,8 +1272,7 @@ static void process_deferred_bios(struct era *era)
bio_list_init(&marked_bios);
spin_lock(&era->deferred_lock);
- bio_list_merge(&deferred_bios, &era->deferred_bios);
- bio_list_init(&era->deferred_bios);
+ bio_list_merge_init(&deferred_bios, &era->deferred_bios);
spin_unlock(&era->deferred_lock);
if (bio_list_empty(&deferred_bios))
@@ -1734,8 +1733,8 @@ static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
if (io_opt_sectors < era->sectors_per_block ||
do_div(io_opt_sectors, era->sectors_per_block)) {
- blk_limits_io_min(limits, 0);
- blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT);
+ limits->io_min = 0;
+ limits->io_opt = era->sectors_per_block << SECTOR_SHIFT;
}
}
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index 2a71bcdba92d..b37bbe762500 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -212,8 +212,10 @@ static char __init *dm_parse_device_entry(struct dm_device *dev, char *str)
strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid));
/* minor */
if (strlen(field[2])) {
- if (kstrtoull(field[2], 0, &dev->dmi.dev))
+ if (kstrtoull(field[2], 0, &dev->dmi.dev) ||
+ dev->dmi.dev >= (1 << MINORBITS))
return ERR_PTR(-EINVAL);
+ dev->dmi.dev = huge_encode_dev((dev_t)dev->dmi.dev);
dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG;
}
/* flags */
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 7f3dc8ee6ab8..ee9f7cecd78e 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -44,6 +44,7 @@
#define BITMAP_FLUSH_INTERVAL (10 * HZ)
#define DISCARD_FILLER 0xf6
#define SALT_SIZE 16
+#define RECHECK_POOL_SIZE 256
/*
* Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -62,6 +63,7 @@
#define SB_VERSION_3 3
#define SB_VERSION_4 4
#define SB_VERSION_5 5
+#define SB_VERSION_6 6
#define SB_SECTORS 8
#define MAX_SECTORS_PER_BLOCK 8
@@ -86,6 +88,7 @@ struct superblock {
#define SB_FLAG_DIRTY_BITMAP 0x4
#define SB_FLAG_FIXED_PADDING 0x8
#define SB_FLAG_FIXED_HMAC 0x10
+#define SB_FLAG_INLINE 0x20
#define JOURNAL_ENTRY_ROUNDUP 8
@@ -166,6 +169,7 @@ struct dm_integrity_c {
struct dm_dev *meta_dev;
unsigned int tag_size;
__s8 log2_tag_size;
+ unsigned int tuple_size;
sector_t start;
mempool_t journal_io_mempool;
struct dm_io_client *io;
@@ -279,6 +283,8 @@ struct dm_integrity_c {
atomic64_t number_of_mismatches;
mempool_t recheck_pool;
+ struct bio_set recheck_bios;
+ struct bio_set recalc_bios;
struct notifier_block reboot_notifier;
};
@@ -314,6 +320,11 @@ struct dm_integrity_io {
struct completion *completion;
struct dm_bio_details bio_details;
+
+ char *integrity_payload;
+ unsigned payload_len;
+ bool integrity_payload_from_mempool;
+ bool integrity_range_locked;
};
struct journal_completion {
@@ -350,26 +361,8 @@ static struct kmem_cache *journal_io_cache;
#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0)
#endif
-static void dm_integrity_prepare(struct request *rq)
-{
-}
-
-static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes)
-{
-}
-
-/*
- * DM Integrity profile, protection is performed layer above (dm-crypt)
- */
-static const struct blk_integrity_profile dm_integrity_profile = {
- .name = "DM-DIF-EXT-TAG",
- .generate_fn = NULL,
- .verify_fn = NULL,
- .prepare_fn = dm_integrity_prepare,
- .complete_fn = dm_integrity_complete,
-};
-
static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
+static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map);
static void integrity_bio_wait(struct work_struct *w);
static void dm_integrity_dtr(struct dm_target *ti);
@@ -479,7 +472,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned int *sec_ptr)
static void sb_set_version(struct dm_integrity_c *ic)
{
- if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC))
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE))
+ ic->sb->version = SB_VERSION_6;
+ else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC))
ic->sb->version = SB_VERSION_5;
else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
ic->sb->version = SB_VERSION_4;
@@ -499,7 +494,8 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
__u8 *sb = (__u8 *)ic->sb;
__u8 *mac = sb + (1 << SECTOR_SHIFT) - mac_size;
- if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT) {
+ if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT ||
+ mac_size > HASH_MAX_DIGESTSIZE) {
dm_integrity_io_error(ic, "digest is too long", -EINVAL);
return -EINVAL;
}
@@ -1508,15 +1504,15 @@ static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_dat
if (!ic->meta_dev)
flush_data = false;
if (flush_data) {
- fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
- fr.io_req.mem.type = DM_IO_KMEM,
- fr.io_req.mem.ptr.addr = NULL,
- fr.io_req.notify.fn = flush_notify,
+ fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ fr.io_req.mem.type = DM_IO_KMEM;
+ fr.io_req.mem.ptr.addr = NULL;
+ fr.io_req.notify.fn = flush_notify;
fr.io_req.notify.context = &fr;
- fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio),
- fr.io_reg.bdev = ic->dev->bdev,
- fr.io_reg.sector = 0,
- fr.io_reg.count = 0,
+ fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio);
+ fr.io_reg.bdev = ic->dev->bdev;
+ fr.io_reg.sector = 0;
+ fr.io_reg.count = 0;
fr.ic = ic;
init_completion(&fr.comp);
r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL, IOPRIO_DEFAULT);
@@ -1784,7 +1780,7 @@ static void integrity_metadata(struct work_struct *w)
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
char *checksums;
unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
- char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+ char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
sector_t sector;
unsigned int sectors_to_process;
@@ -1913,6 +1909,35 @@ error:
dec_in_flight(dio);
}
+/*
+ * Validate a bio against the geometry of the integrity device.
+ *
+ * Returns true when the bio ends within the provided data sectors, its start
+ * sector and length are multiples of the block size and, for non-discard bios
+ * on devices with multi-sector blocks, every segment length is a multiple of
+ * the block size in bytes.  Otherwise the reason is logged with DMERR and
+ * false is returned.
+ */
+static inline bool dm_integrity_check_limits(struct dm_integrity_c *ic, sector_t logical_sector, struct bio *bio)
+{
+	const unsigned int n_sectors = bio_sectors(bio);
+	const unsigned int sector_mask = ic->sectors_per_block - 1;
+	const unsigned int byte_mask = (ic->sectors_per_block << SECTOR_SHIFT) - 1;
+	struct bvec_iter it;
+	struct bio_vec seg;
+
+	if (unlikely(logical_sector + n_sectors > ic->provided_data_sectors)) {
+		DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
+		      logical_sector, n_sectors,
+		      ic->provided_data_sectors);
+		return false;
+	}
+
+	if (unlikely((logical_sector | n_sectors) & sector_mask)) {
+		DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
+		      ic->sectors_per_block,
+		      logical_sector, n_sectors);
+		return false;
+	}
+
+	/* Single-sector blocks and discards need no per-segment check. */
+	if (ic->sectors_per_block <= 1 || unlikely(bio_op(bio) == REQ_OP_DISCARD))
+		return true;
+
+	bio_for_each_segment(seg, bio, it) {
+		if (likely(!(seg.bv_len & byte_mask)))
+			continue;
+		DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
+		      seg.bv_offset, seg.bv_len, ic->sectors_per_block);
+		return false;
+	}
+	return true;
+}
+
static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
{
struct dm_integrity_c *ic = ti->private;
@@ -1925,6 +1950,14 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
dio->bi_status = 0;
dio->op = bio_op(bio);
+ if (ic->mode == 'I') {
+ bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector);
+ dio->integrity_payload = NULL;
+ dio->integrity_payload_from_mempool = false;
+ dio->integrity_range_locked = false;
+ return dm_integrity_map_inline(dio, true);
+ }
+
if (unlikely(dio->op == REQ_OP_DISCARD)) {
if (ti->max_io_len) {
sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector);
@@ -1954,31 +1987,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
*/
bio->bi_opf &= ~REQ_FUA;
}
- if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
- DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
- dio->range.logical_sector, bio_sectors(bio),
- ic->provided_data_sectors);
- return DM_MAPIO_KILL;
- }
- if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned int)(ic->sectors_per_block - 1))) {
- DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
- ic->sectors_per_block,
- dio->range.logical_sector, bio_sectors(bio));
+ if (unlikely(!dm_integrity_check_limits(ic, dio->range.logical_sector, bio)))
return DM_MAPIO_KILL;
- }
-
- if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) {
- struct bvec_iter iter;
- struct bio_vec bv;
-
- bio_for_each_segment(bv, bio, iter) {
- if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
- DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
- bv.bv_offset, bv.bv_len, ic->sectors_per_block);
- return DM_MAPIO_KILL;
- }
- }
- }
bip = bio_integrity(bio);
if (!ic->internal_hash) {
@@ -2063,7 +2073,7 @@ retry_kmap:
} while (++s < ic->sectors_per_block);
#ifdef INTERNAL_VERIFY
if (ic->internal_hash) {
- char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+ char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
@@ -2173,6 +2183,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
unsigned int journal_section, journal_entry;
unsigned int journal_read_pos;
+ sector_t recalc_sector;
struct completion read_comp;
bool discard_retried = false;
bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
@@ -2313,6 +2324,7 @@ offload_to_thread:
goto lock_retry;
}
}
+ recalc_sector = le64_to_cpu(ic->sb->recalc_sector);
spin_unlock_irq(&ic->endio_wait.lock);
if (unlikely(journal_read_pos != NOT_FOUND)) {
@@ -2367,7 +2379,7 @@ offload_to_thread:
if (need_sync_io) {
wait_for_completion_io(&read_comp);
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
- dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
+ dio->range.logical_sector + dio->range.n_sectors > recalc_sector)
goto skip_check;
if (ic->mode == 'B') {
if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
@@ -2394,12 +2406,278 @@ journal_read_write:
do_endio_flush(ic, dio);
}
+/*
+ * Map a bio in inline ('I') mode, where the integrity tags travel with the
+ * bio as a block-layer integrity payload instead of being stored separately.
+ *
+ * @dio:      per-bio state of the bio being mapped
+ * @from_map: true when called directly from dm_integrity_map(); false when
+ *            called from the integrity_bio_wait() work item.  When true and
+ *            the range conflicts with one already locked, the bio is handed
+ *            to ic->wait_wq instead of waiting here.
+ *
+ * Returns DM_MAPIO_REMAPPED when the bio is ready to be submitted,
+ * DM_MAPIO_SUBMITTED when the bio was completed or deferred here, and
+ * DM_MAPIO_KILL when the bio violates the device limits.
+ */
+static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map)
+{
+	struct dm_integrity_c *ic = dio->ic;
+	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+	struct bio_integrity_payload *bip;
+	unsigned ret;
+	sector_t recalc_sector;
+
+	/* A bio that already carries an integrity payload is rejected. */
+	if (unlikely(bio_integrity(bio))) {
+		bio->bi_status = BLK_STS_NOTSUPP;
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	bio_set_dev(bio, ic->dev->bdev);
+	if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0))
+		return DM_MAPIO_REMAPPED;
+
+retry:
+	if (!dio->integrity_payload) {
+		unsigned digest_size, extra_size;
+		dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block);
+		digest_size = crypto_shash_digestsize(ic->internal_hash);
+		/* Room for a digest that is longer than the stored tag. */
+		extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
+		dio->payload_len += extra_size;
+		dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+		if (unlikely(!dio->integrity_payload)) {
+			const unsigned x_size = PAGE_SIZE << 1;
+			/*
+			 * Allocation failed: if the payload would not fit in
+			 * x_size bytes, shrink the bio so that it does and
+			 * try again.  A payload that already fits is taken
+			 * from the mempool further below.
+			 */
+			if (dio->payload_len > x_size) {
+				unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block;
+				if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) {
+					bio->bi_status = BLK_STS_NOTSUPP;
+					bio_endio(bio);
+					return DM_MAPIO_SUBMITTED;
+				}
+				dm_accept_partial_bio(bio, sectors);
+				goto retry;
+			}
+		}
+	}
+
+	dio->range.logical_sector = bio->bi_iter.bi_sector;
+	dio->range.n_sectors = bio_sectors(bio);
+
+	if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)))
+		goto skip_spinlock;
+#ifdef CONFIG_64BIT
+	/*
+	 * On 64-bit CPUs we can optimize the lock away (so that it won't cause
+	 * cache line bouncing) and use acquire/release barriers instead.
+	 *
+	 * Paired with smp_store_release in integrity_recalc_inline.
+	 */
+	recalc_sector = le64_to_cpu(smp_load_acquire(&ic->sb->recalc_sector));
+	if (likely(dio->range.logical_sector + dio->range.n_sectors <= recalc_sector))
+		goto skip_spinlock;
+#endif
+	spin_lock_irq(&ic->endio_wait.lock);
+	recalc_sector = le64_to_cpu(ic->sb->recalc_sector);
+	if (dio->range.logical_sector + dio->range.n_sectors <= recalc_sector)
+		goto skip_unlock;
+	/* The bio reaches past recalc_sector: lock its range. */
+	if (unlikely(!add_new_range(ic, &dio->range, true))) {
+		if (from_map) {
+			spin_unlock_irq(&ic->endio_wait.lock);
+			INIT_WORK(&dio->work, integrity_bio_wait);
+			queue_work(ic->wait_wq, &dio->work);
+			return DM_MAPIO_SUBMITTED;
+		}
+		wait_and_add_new_range(ic, &dio->range);
+	}
+	dio->integrity_range_locked = true;
+skip_unlock:
+	spin_unlock_irq(&ic->endio_wait.lock);
+skip_spinlock:
+
+	if (unlikely(!dio->integrity_payload)) {
+		dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO));
+		dio->integrity_payload_from_mempool = true;
+	}
+
+	/* Keep the original iterator; it is walked per block below and in end_io. */
+	dio->bio_details.bi_iter = bio->bi_iter;
+
+	/*
+	 * NOTE(review): the payload allocated above and any range locked above
+	 * are not released in this function on the KILL path - confirm that
+	 * the caller or the end_io path takes care of them.
+	 */
+	if (unlikely(!dm_integrity_check_limits(ic, bio->bi_iter.bi_sector, bio))) {
+		return DM_MAPIO_KILL;
+	}
+
+	bio->bi_iter.bi_sector += ic->start + SB_SECTORS;
+
+	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+	if (IS_ERR(bip)) {
+		bio->bi_status = errno_to_blk_status(PTR_ERR(bip));
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	if (dio->op == REQ_OP_WRITE) {
+		unsigned pos = 0;
+		while (dio->bio_details.bi_iter.bi_size) {
+			struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
+			const char *mem = bvec_kmap_local(&bv);
+			/*
+			 * Zero the part of the tuple that the tag does not
+			 * cover, so that no uninitialized memory is written
+			 * out as integrity metadata.
+			 */
+			if (ic->tag_size < ic->tuple_size)
+				memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tag_size);
+			integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos);
+			kunmap_local(mem);
+			pos += ic->tuple_size;
+			bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
+		}
+	}
+
+	ret = bio_integrity_add_page(bio, virt_to_page(dio->integrity_payload),
+					dio->payload_len, offset_in_page(dio->integrity_payload));
+	if (unlikely(ret != dio->payload_len)) {
+		bio->bi_status = BLK_STS_RESOURCE;
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Release the integrity payload attached to @dio and reset the bookkeeping
+ * fields.  A payload taken from ic->recheck_pool goes back to that mempool;
+ * any other payload is released with kfree().
+ */
+static inline void dm_integrity_free_payload(struct dm_integrity_io *dio)
+{
+	char *payload = dio->integrity_payload;
+	const bool pooled = dio->integrity_payload_from_mempool;
+
+	dio->integrity_payload = NULL;
+	dio->integrity_payload_from_mempool = false;
+
+	if (likely(!pooled)) {
+		kfree(payload);
+		return;
+	}
+	mempool_free(virt_to_page(payload), &dio->ic->recheck_pool);
+}
+
+/*
+ * Work item queued by dm_integrity_end_io() after a checksum mismatch on an
+ * inline-mode read.  Every block still left in dio->bio_details.bi_iter is
+ * read again, one block per bio, into a private buffer and verified there.
+ * A block that verifies is copied into the original bio; on the first
+ * mismatch or I/O error the original bio is ended with an error status.
+ */
+static void dm_integrity_inline_recheck(struct work_struct *w)
+{
+	struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
+	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+	struct dm_integrity_c *ic = dio->ic;
+	struct bio *outgoing_bio;
+	void *outgoing_data;
+
+	dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO));
+	dio->integrity_payload_from_mempool = true;
+
+	/*
+	 * Tags live at the start of the pool element, data one page further.
+	 * NOTE(review): assumes each recheck_pool element spans at least two
+	 * pages - confirm against the mempool creation.
+	 */
+	outgoing_data = dio->integrity_payload + PAGE_SIZE;
+
+	while (dio->bio_details.bi_iter.bi_size) {
+		char digest[HASH_MAX_DIGESTSIZE];
+		int r;
+		struct bio_integrity_payload *bip;
+		struct bio_vec bv;
+		char *mem;
+
+		outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
+
+		/* One block of data per re-read. */
+		r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
+		if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
+			bio_put(outgoing_bio);
+			bio->bi_status = BLK_STS_RESOURCE;
+			bio_endio(bio);
+			return;
+		}
+
+		bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
+		if (IS_ERR(bip)) {
+			bio_put(outgoing_bio);
+			bio->bi_status = errno_to_blk_status(PTR_ERR(bip));
+			bio_endio(bio);
+			return;
+		}
+
+		/* One tuple of integrity metadata for that block. */
+		r = bio_integrity_add_page(outgoing_bio, virt_to_page(dio->integrity_payload), ic->tuple_size, 0);
+		if (unlikely(r != ic->tuple_size)) {
+			bio_put(outgoing_bio);
+			bio->bi_status = BLK_STS_RESOURCE;
+			bio_endio(bio);
+			return;
+		}
+
+		outgoing_bio->bi_iter.bi_sector = dio->bio_details.bi_iter.bi_sector + ic->start + SB_SECTORS;
+
+		r = submit_bio_wait(outgoing_bio);
+		if (unlikely(r != 0)) {
+			bio_put(outgoing_bio);
+			bio->bi_status = errno_to_blk_status(r);
+			bio_endio(bio);
+			return;
+		}
+		bio_put(outgoing_bio);
+
+		/* Only the shorter of digest size and tag size is compared. */
+		integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
+		if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
+			DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
+				ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
+			atomic64_inc(&ic->number_of_mismatches);
+			dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum",
+				bio, dio->bio_details.bi_iter.bi_sector, 0);
+
+			bio->bi_status = BLK_STS_PROTECTION;
+			bio_endio(bio);
+			return;
+		}
+
+		/* Verified: hand the block to the original bio and move on. */
+		bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
+		mem = bvec_kmap_local(&bv);
+		memcpy(mem, outgoing_data, ic->sectors_per_block << SECTOR_SHIFT);
+		kunmap_local(mem);
+
+		bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
+	}
+
+	bio_endio(bio);
+}
+
+/*
+ * End-io hook.  Only acts in inline ('I') mode: for a successfully completed
+ * read, the checksum of every block is recomputed and compared with the tag
+ * found in the integrity payload.  On the first mismatch the payload is freed
+ * and the bio is handed to dm_integrity_inline_recheck() on ic->offload_wq.
+ *
+ * Returns DM_ENDIO_INCOMPLETE when the bio was queued for a recheck and
+ * DM_ENDIO_DONE otherwise.  On the DM_ENDIO_DONE path in inline mode the
+ * payload is freed and a range locked by the map path is removed.
+ */
+static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
+{
+	struct dm_integrity_c *ic = ti->private;
+	if (ic->mode == 'I') {
+		struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+		if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) {
+			unsigned pos = 0;
+			unsigned cmp_len;
+			/* Reads of a range locked during recalculation are not verified. */
+			if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+			    unlikely(dio->integrity_range_locked))
+				goto skip_check;
+			/* Only the shorter of digest size and tag size is compared. */
+			cmp_len = min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size);
+			while (dio->bio_details.bi_iter.bi_size) {
+				char digest[HASH_MAX_DIGESTSIZE];
+				struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
+				char *mem = bvec_kmap_local(&bv);
+				integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
+				if (unlikely(memcmp(digest, dio->integrity_payload + pos, cmp_len))) {
+					kunmap_local(mem);
+					dm_integrity_free_payload(dio);
+					INIT_WORK(&dio->work, dm_integrity_inline_recheck);
+					queue_work(ic->offload_wq, &dio->work);
+					return DM_ENDIO_INCOMPLETE;
+				}
+				kunmap_local(mem);
+				pos += ic->tuple_size;
+				bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
+			}
+		}
+skip_check:
+		dm_integrity_free_payload(dio);
+		if (unlikely(dio->integrity_range_locked))
+			remove_range(ic, &dio->range);
+	}
+	return DM_ENDIO_DONE;
+}
static void integrity_bio_wait(struct work_struct *w)
{
struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
+ struct dm_integrity_c *ic = dio->ic;
- dm_integrity_map_continue(dio, false);
+ if (ic->mode == 'I') {
+ struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+ int r = dm_integrity_map_inline(dio, false);
+ switch (r) {
+ case DM_MAPIO_KILL:
+ bio->bi_status = BLK_STS_IOERR;
+ fallthrough;
+ case DM_MAPIO_REMAPPED:
+ submit_bio_noacct(bio);
+ fallthrough;
+ case DM_MAPIO_SUBMITTED:
+ return;
+ default:
+ BUG();
+ }
+ } else {
+ dm_integrity_map_continue(dio, false);
+ }
}
static void pad_uncommitted(struct dm_integrity_c *ic)
@@ -2432,6 +2710,9 @@ static void integrity_commit(struct work_struct *w)
del_timer(&ic->autocommit_timer);
+ if (ic->mode == 'I')
+ return;
+
spin_lock_irq(&ic->endio_wait.lock);
flushes = bio_list_get(&ic->flush_bio_list);
if (unlikely(ic->mode != 'J')) {
@@ -2626,7 +2907,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
unlikely(from_replay) &&
#endif
ic->internal_hash) {
- char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+ char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
(char *)access_journal_data(ic, i, l), test_tag);
@@ -2868,6 +3149,133 @@ free_ret:
kvfree(recalc_tags);
}
+/*
+ * Work item that recalculates the integrity tags in inline mode.
+ *
+ * Starting at ic->sb->recalc_sector, the device is processed in chunks of up
+ * to recalc_sectors sectors: the chunk's range is locked, its data is read,
+ * a tag is computed for every block and the data is written back with the
+ * tags attached as integrity payload.  recalc_sector is then advanced past
+ * the chunk.  The loop stops when the target is post-suspending, when the end
+ * of the provided data sectors is reached, or on an error.
+ */
+static void integrity_recalc_inline(struct work_struct *w)
+{
+	struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
+	size_t recalc_tags_size;
+	u8 *recalc_buffer = NULL;
+	u8 *recalc_tags = NULL;
+	struct dm_integrity_range range;
+	struct bio *bio;
+	struct bio_integrity_payload *bip;
+	__u8 *t;
+	unsigned int i;
+	int r;
+	unsigned ret;
+	unsigned int super_counter = 0;
+	unsigned recalc_sectors = RECALC_SECTORS;
+
+	/*
+	 * Allocate the data and tag buffers.  Whenever either allocation
+	 * fails, halve the chunk size and retry; give up once the chunk would
+	 * be smaller than one block.
+	 */
+retry:
+	recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN);
+	if (!recalc_buffer) {
+oom:
+		recalc_sectors >>= 1;
+		if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block)
+			goto retry;
+		DMCRIT("out of memory for recalculate buffer - recalculation disabled");
+		goto free_ret;
+	}
+
+	recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size;
+	/* Extra room when the digest is longer than one tuple. */
+	if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size)
+		recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size;
+	recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN);
+	if (!recalc_tags) {
+		kfree(recalc_buffer);
+		recalc_buffer = NULL;
+		goto oom;
+	}
+
+	spin_lock_irq(&ic->endio_wait.lock);
+
+	/* Entered with ic->endio_wait.lock held. */
+next_chunk:
+	if (unlikely(dm_post_suspending(ic->ti)))
+		goto unlock_ret;
+
+	range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
+	if (unlikely(range.logical_sector >= ic->provided_data_sectors))
+		goto unlock_ret;
+	range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector);
+
+	add_new_range_and_wait(ic, &range);
+	spin_unlock_irq(&ic->endio_wait.lock);
+
+	/* Write the superblock once every RECALC_WRITE_SUPER chunks. */
+	if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
+		recalc_write_super(ic);
+		super_counter = 0;
+	}
+
+	if (unlikely(dm_integrity_failed(ic)))
+		goto err;
+
+	DEBUG_print("recalculating: %llx - %llx\n", range.logical_sector, range.n_sectors);
+
+	/* Read the chunk's data. */
+	bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
+	bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
+	__bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+	r = submit_bio_wait(bio);
+	bio_put(bio);
+	if (unlikely(r)) {
+		dm_integrity_io_error(ic, "reading data", r);
+		goto err;
+	}
+
+	/* Compute one zero-padded tuple per block. */
+	t = recalc_tags;
+	for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
+		memset(t, 0, ic->tuple_size);
+		integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
+		t += ic->tuple_size;
+	}
+
+	/* Write the same data back, now with the tags attached. */
+	bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
+	bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
+	__bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+
+	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+	if (unlikely(IS_ERR(bip))) {
+		bio_put(bio);
+		DMCRIT("out of memory for bio integrity payload - recalculation disabled");
+		goto err;
+	}
+	ret = bio_integrity_add_page(bio, virt_to_page(recalc_tags), t - recalc_tags, offset_in_page(recalc_tags));
+	if (unlikely(ret != t - recalc_tags)) {
+		bio_put(bio);
+		dm_integrity_io_error(ic, "attaching integrity tags", -ENOMEM);
+		goto err;
+	}
+
+	r = submit_bio_wait(bio);
+	bio_put(bio);
+	if (unlikely(r)) {
+		dm_integrity_io_error(ic, "writing data", r);
+		goto err;
+	}
+
+	cond_resched();
+	/* Release the range and advance recalc_sector under the lock. */
+	spin_lock_irq(&ic->endio_wait.lock);
+	remove_range_unlocked(ic, &range);
+#ifdef CONFIG_64BIT
+	/* Paired with smp_load_acquire in dm_integrity_map_inline. */
+	smp_store_release(&ic->sb->recalc_sector, cpu_to_le64(range.logical_sector + range.n_sectors));
+#else
+	ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
+#endif
+	goto next_chunk;
+
+	/* Error after the range was locked: unlock it and stop. */
+err:
+	remove_range(ic, &range);
+	goto free_ret;
+
+	/* Regular exit from the loop, still holding the lock. */
+unlock_ret:
+	spin_unlock_irq(&ic->endio_wait.lock);
+
+	recalc_write_super(ic);
+
+free_ret:
+	kfree(recalc_buffer);
+	kfree(recalc_tags);
+}
+
static void bitmap_block_work(struct work_struct *w)
{
struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
@@ -3490,9 +3898,21 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim
if (ic->sectors_per_block > 1) {
limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
- blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT);
+ limits->io_min = ic->sectors_per_block << SECTOR_SHIFT;
limits->dma_alignment = limits->logical_block_size - 1;
+ limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT;
}
+
+ if (!ic->internal_hash) {
+ struct blk_integrity *bi = &limits->integrity;
+
+ memset(bi, 0, sizeof(*bi));
+ bi->tuple_size = ic->tag_size;
+ bi->tag_size = bi->tuple_size;
+ bi->interval_exp =
+ ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
+ }
+
limits->max_integrity_segments = USHRT_MAX;
}
@@ -3522,7 +3942,10 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
return -EINVAL;
ic->initial_sectors = initial_sectors;
- if (!ic->meta_dev) {
+ if (ic->mode == 'I') {
+ if (ic->initial_sectors + ic->provided_data_sectors > ic->meta_device_sectors)
+ return -EINVAL;
+ } else if (!ic->meta_dev) {
sector_t last_sector, last_area, last_offset;
/* we have to maintain excessive padding for compatibility with existing volumes */
@@ -3585,6 +4008,8 @@ static int initialize_superblock(struct dm_integrity_c *ic,
memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
memcpy(ic->sb->magic, SB_MAGIC, 8);
+ if (ic->mode == 'I')
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_INLINE);
ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
if (ic->journal_mac_alg.alg_string)
@@ -3594,6 +4019,8 @@ static int initialize_superblock(struct dm_integrity_c *ic,
journal_sections = journal_sectors / ic->journal_section_sectors;
if (!journal_sections)
journal_sections = 1;
+ if (ic->mode == 'I')
+ journal_sections = 0;
if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) {
ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC);
@@ -3649,20 +4076,6 @@ try_smaller_buffer:
return 0;
}
-static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
-{
- struct gendisk *disk = dm_disk(dm_table_get_md(ti->table));
- struct blk_integrity bi;
-
- memset(&bi, 0, sizeof(bi));
- bi.profile = &dm_integrity_profile;
- bi.tuple_size = ic->tag_size;
- bi.tag_size = bi.tuple_size;
- bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
-
- blk_integrity_register(disk, &bi);
-}
-
static void dm_integrity_free_page_list(struct page_list *pl)
{
unsigned int i;
@@ -4156,10 +4569,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
}
if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
- !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) {
+ !strcmp(argv[3], "D") || !strcmp(argv[3], "R") ||
+ !strcmp(argv[3], "I")) {
ic->mode = argv[3][0];
} else {
- ti->error = "Invalid mode (expecting J, B, D, R)";
+ ti->error = "Invalid mode (expecting J, B, D, R, I)";
r = -EINVAL;
goto bad;
}
@@ -4305,6 +4719,53 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
else
ic->log2_tag_size = -1;
+ if (ic->mode == 'I') {
+ struct blk_integrity *bi;
+ if (ic->meta_dev) {
+ r = -EINVAL;
+ ti->error = "Metadata device not supported in inline mode";
+ goto bad;
+ }
+ if (!ic->internal_hash_alg.alg_string) {
+ r = -EINVAL;
+ ti->error = "Internal hash not set in inline mode";
+ goto bad;
+ }
+ if (ic->journal_crypt_alg.alg_string || ic->journal_mac_alg.alg_string) {
+ r = -EINVAL;
+ ti->error = "Journal crypt not supported in inline mode";
+ goto bad;
+ }
+ if (ic->discard) {
+ r = -EINVAL;
+ ti->error = "Discards not supported in inline mode";
+ goto bad;
+ }
+ bi = blk_get_integrity(ic->dev->bdev->bd_disk);
+ if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) {
+ r = -EINVAL;
+ ti->error = "Integrity profile not supported";
+ goto bad;
+ }
+ /*printk("tag_size: %u, tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/
+ if (bi->tuple_size < ic->tag_size) {
+ r = -EINVAL;
+ ti->error = "The integrity profile is smaller than tag size";
+ goto bad;
+ }
+ if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) {
+ r = -EINVAL;
+ ti->error = "Too big tuple size";
+ goto bad;
+ }
+ ic->tuple_size = bi->tuple_size;
+ if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) {
+ r = -EINVAL;
+ ti->error = "Integrity profile sector size mismatch";
+ goto bad;
+ }
+ }
+
if (ic->mode == 'B' && !ic->internal_hash) {
r = -EINVAL;
ti->error = "Bitmap mode can be only used with internal hash";
@@ -4335,12 +4796,37 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
goto bad;
}
- r = mempool_init_page_pool(&ic->recheck_pool, 1, 0);
+ r = mempool_init_page_pool(&ic->recheck_pool, 1, ic->mode == 'I' ? 1 : 0);
if (r) {
ti->error = "Cannot allocate mempool";
goto bad;
}
+ if (ic->mode == 'I') {
+ r = bioset_init(&ic->recheck_bios, RECHECK_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (r) {
+ ti->error = "Cannot allocate bio set";
+ goto bad;
+ }
+ r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE);
+ if (r) {
+ ti->error = "Cannot allocate bio integrity set";
+ r = -ENOMEM;
+ goto bad;
+ }
+ r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS);
+ if (r) {
+ ti->error = "Cannot allocate bio set";
+ goto bad;
+ }
+ r = bioset_integrity_create(&ic->recalc_bios, 1);
+ if (r) {
+ ti->error = "Cannot allocate bio integrity set";
+ r = -ENOMEM;
+ goto bad;
+ }
+ }
+
ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
if (!ic->metadata_wq) {
@@ -4417,11 +4903,16 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
should_write_sb = true;
}
- if (!ic->sb->version || ic->sb->version > SB_VERSION_5) {
+ if (!ic->sb->version || ic->sb->version > SB_VERSION_6) {
r = -EINVAL;
ti->error = "Unknown version";
goto bad;
}
+ if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE)) != (ic->mode == 'I')) {
+ r = -EINVAL;
+ ti->error = "Inline flag mismatch";
+ goto bad;
+ }
if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
r = -EINVAL;
ti->error = "Tag size doesn't match the information in superblock";
@@ -4432,10 +4923,18 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
ti->error = "Block size doesn't match the information in superblock";
goto bad;
}
- if (!le32_to_cpu(ic->sb->journal_sections)) {
- r = -EINVAL;
- ti->error = "Corrupted superblock, journal_sections is 0";
- goto bad;
+ if (ic->mode != 'I') {
+ if (!le32_to_cpu(ic->sb->journal_sections)) {
+ r = -EINVAL;
+ ti->error = "Corrupted superblock, journal_sections is 0";
+ goto bad;
+ }
+ } else {
+ if (le32_to_cpu(ic->sb->journal_sections)) {
+ r = -EINVAL;
+ ti->error = "Corrupted superblock, journal_sections is not 0";
+ goto bad;
+ }
}
/* make sure that ti->max_io_len doesn't overflow */
if (!ic->meta_dev) {
@@ -4486,8 +4985,9 @@ try_smaller_buffer:
bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
if (bits_in_journal > UINT_MAX)
bits_in_journal = UINT_MAX;
- while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
- log2_sectors_per_bitmap_bit++;
+ if (bits_in_journal)
+ while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
+ log2_sectors_per_bitmap_bit++;
log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
@@ -4507,7 +5007,6 @@ try_smaller_buffer:
goto bad;
}
-
threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
threshold += 50;
do_div(threshold, 100);
@@ -4542,7 +5041,7 @@ try_smaller_buffer:
r = -ENOMEM;
goto bad;
}
- INIT_WORK(&ic->recalc_work, integrity_recalc);
+ INIT_WORK(&ic->recalc_work, ic->mode == 'I' ? integrity_recalc_inline : integrity_recalc);
} else {
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
ti->error = "Recalculate can only be specified with internal_hash";
@@ -4569,7 +5068,7 @@ try_smaller_buffer:
}
dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
- if (ic->mode != 'R') {
+ if (ic->mode != 'R' && ic->mode != 'I') {
r = create_journal(ic, &ti->error);
if (r)
goto bad;
@@ -4629,7 +5128,7 @@ try_smaller_buffer:
ic->just_formatted = true;
}
- if (!ic->meta_dev) {
+ if (!ic->meta_dev && ic->mode != 'I') {
r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
if (r)
goto bad;
@@ -4648,14 +5147,14 @@ try_smaller_buffer:
}
}
- if (!ic->internal_hash)
- dm_integrity_set(ti, ic);
-
ti->num_flush_bios = 1;
ti->flush_supported = true;
if (ic->discard)
ti->num_discard_bios = 1;
+ if (ic->mode == 'I')
+ ti->mempool_needs_integrity = true;
+
dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1);
return 0;
@@ -4689,6 +5188,8 @@ static void dm_integrity_dtr(struct dm_target *ti)
kvfree(ic->bbs);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
+ bioset_exit(&ic->recalc_bios);
+ bioset_exit(&ic->recheck_bios);
mempool_exit(&ic->recheck_pool);
mempool_exit(&ic->journal_io_mempool);
if (ic->io)
@@ -4742,12 +5243,13 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 11, 0},
+ .version = {1, 13, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
.dtr = dm_integrity_dtr,
.map = dm_integrity_map,
+ .end_io = dm_integrity_end_io,
.postsuspend = dm_integrity_postsuspend,
.resume = dm_integrity_resume,
.status = dm_integrity_status,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 7409490259d1..c37668790577 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -347,7 +347,7 @@ static void do_region(const blk_opf_t opf, unsigned int region,
break;
default:
num_bvecs = bio_max_segs(dm_sector_div_up(remaining,
- (PAGE_SIZE >> SECTOR_SHIFT)));
+ (PAGE_SIZE >> SECTOR_SHIFT)) + 1);
}
bio = bio_alloc_bioset(where->bdev, num_bvecs, opf, GFP_NOIO,
@@ -379,21 +379,19 @@ static void do_region(const blk_opf_t opf, unsigned int region,
atomic_inc(&io->count);
submit_bio(bio);
+ WARN_ON_ONCE(opf & REQ_ATOMIC && remaining);
} while (remaining);
}
static void dispatch_io(blk_opf_t opf, unsigned int num_regions,
struct dm_io_region *where, struct dpages *dp,
- struct io *io, int sync, unsigned short ioprio)
+ struct io *io, unsigned short ioprio)
{
int i;
struct dpages old_pages = *dp;
BUG_ON(num_regions > DM_IO_MAX_REGIONS);
- if (sync)
- opf |= REQ_SYNC;
-
/*
* For multiple regions we need to be careful to rewind
* the dp object for each call to do_region.
@@ -411,6 +409,26 @@ static void dispatch_io(blk_opf_t opf, unsigned int num_regions,
dec_count(io, 0, 0);
}
+static void async_io(struct dm_io_client *client, unsigned int num_regions,
+ struct dm_io_region *where, blk_opf_t opf,
+ struct dpages *dp, io_notify_fn fn, void *context,
+ unsigned short ioprio)
+{
+ struct io *io;
+
+ io = mempool_alloc(&client->pool, GFP_NOIO);
+ io->error_bits = 0;
+ atomic_set(&io->count, 1); /* see dispatch_io() */
+ io->client = client;
+ io->callback = fn;
+ io->context = context;
+
+ io->vma_invalidate_address = dp->vma_invalidate_address;
+ io->vma_invalidate_size = dp->vma_invalidate_size;
+
+ dispatch_io(opf, num_regions, where, dp, io, ioprio);
+}
+
struct sync_io {
unsigned long error_bits;
struct completion wait;
@@ -428,27 +446,12 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, blk_opf_t opf, struct dpages *dp,
unsigned long *error_bits, unsigned short ioprio)
{
- struct io *io;
struct sync_io sio;
- if (num_regions > 1 && !op_is_write(opf)) {
- WARN_ON(1);
- return -EIO;
- }
-
init_completion(&sio.wait);
- io = mempool_alloc(&client->pool, GFP_NOIO);
- io->error_bits = 0;
- atomic_set(&io->count, 1); /* see dispatch_io() */
- io->client = client;
- io->callback = sync_io_complete;
- io->context = &sio;
-
- io->vma_invalidate_address = dp->vma_invalidate_address;
- io->vma_invalidate_size = dp->vma_invalidate_size;
-
- dispatch_io(opf, num_regions, where, dp, io, 1, ioprio);
+ async_io(client, num_regions, where, opf | REQ_SYNC, dp,
+ sync_io_complete, &sio, ioprio);
wait_for_completion_io(&sio.wait);
@@ -458,33 +461,6 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
return sio.error_bits ? -EIO : 0;
}
-static int async_io(struct dm_io_client *client, unsigned int num_regions,
- struct dm_io_region *where, blk_opf_t opf,
- struct dpages *dp, io_notify_fn fn, void *context,
- unsigned short ioprio)
-{
- struct io *io;
-
- if (num_regions > 1 && !op_is_write(opf)) {
- WARN_ON(1);
- fn(1, context);
- return -EIO;
- }
-
- io = mempool_alloc(&client->pool, GFP_NOIO);
- io->error_bits = 0;
- atomic_set(&io->count, 1); /* see dispatch_io() */
- io->client = client;
- io->callback = fn;
- io->context = context;
-
- io->vma_invalidate_address = dp->vma_invalidate_address;
- io->vma_invalidate_size = dp->vma_invalidate_size;
-
- dispatch_io(opf, num_regions, where, dp, io, 0, ioprio);
- return 0;
-}
-
static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
unsigned long size)
{
@@ -529,6 +505,11 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions,
int r;
struct dpages dp;
+ if (num_regions > 1 && !op_is_write(io_req->bi_opf)) {
+ WARN_ON(1);
+ return -EIO;
+ }
+
r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
if (r)
return r;
@@ -537,9 +518,9 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions,
return sync_io(io_req->client, num_regions, where,
io_req->bi_opf, &dp, sync_error_bits, ioprio);
- return async_io(io_req->client, num_regions, where,
- io_req->bi_opf, &dp, io_req->notify.fn,
- io_req->notify.context, ioprio);
+ async_io(io_req->client, num_regions, where, io_req->bi_opf, &dp,
+ io_req->notify.fn, io_req->notify.context, ioprio);
+ return 0;
}
EXPORT_SYMBOL(dm_io);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c2c07bfa6471..d42eac944eb5 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1181,8 +1181,26 @@ static int do_resume(struct dm_ioctl *param)
suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
- if (!dm_suspended_md(md))
- dm_suspend(md, suspend_flags);
+ if (!dm_suspended_md(md)) {
+ r = dm_suspend(md, suspend_flags);
+ if (r) {
+ down_write(&_hash_lock);
+ hc = dm_get_mdptr(md);
+ if (hc && !hc->new_map) {
+ hc->new_map = new_map;
+ new_map = NULL;
+ } else {
+ r = -ENXIO;
+ }
+ up_write(&_hash_lock);
+ if (new_map) {
+ dm_sync_table(md);
+ dm_table_destroy(new_map);
+ }
+ dm_put(md);
+ return r;
+ }
+ }
old_size = dm_get_size(md);
old_map = dm_swap_table(md, new_map);
@@ -1894,7 +1912,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user,
if ((kernel_params->version[0] != DM_VERSION_MAJOR) ||
(kernel_params->version[1] > DM_VERSION_MINOR)) {
- DMERR("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
+ DMERR_LIMIT("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
DM_VERSION_MAJOR, DM_VERSION_MINOR,
DM_VERSION_PATCHLEVEL,
kernel_params->version[0],
@@ -1943,7 +1961,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
if (unlikely(param_kernel->data_size < minimum_data_size) ||
unlikely(param_kernel->data_size > DM_MAX_TARGETS * DM_MAX_TARGET_PARAMS)) {
- DMERR("Invalid data size in the ioctl structure: %u",
+ DMERR_LIMIT("Invalid data size in the ioctl structure: %u",
param_kernel->data_size);
return -EINVAL;
}
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 2d3e186ca87e..66318aba4bdb 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_discard_bios = 1;
ti->num_secure_erase_bios = 1;
ti->num_write_zeroes_bios = 1;
+ ti->flush_bypasses_map = true;
ti->private = lc;
return 0;
@@ -198,9 +199,10 @@ static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
static struct target_type linear_target = {
.name = "linear",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
- DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO,
+ DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO |
+ DM_TARGET_ATOMIC_WRITES,
.report_zones = linear_report_zones,
.module = THIS_MODULE,
.ctr = linear_ctr,
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index f17a6cf2284e..8d7df8303d0a 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -871,7 +871,7 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit
if (!bdev_max_discard_sectors(lc->dev->bdev)) {
lc->device_supports_discard = false;
limits->discard_granularity = lc->sectorsize;
- limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
+ limits->max_hw_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
}
limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 05d1328d1811..637977acc3dc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -704,8 +704,7 @@ static void process_queued_bios(struct work_struct *work)
return;
}
- bio_list_merge(&bios, &m->queued_bios);
- bio_list_init(&m->queued_bios);
+ bio_list_merge_init(&bios, &m->queued_bios);
spin_unlock_irqrestore(&m->lock, flags);
@@ -1420,8 +1419,7 @@ out:
/*
* Fail or reinstate all paths that match the provided struct dm_dev.
*/
-static int action_dev(struct multipath *m, struct dm_dev *dev,
- action_fn action)
+static int action_dev(struct multipath *m, dev_t dev, action_fn action)
{
int r = -EINVAL;
struct pgpath *pgpath;
@@ -1429,7 +1427,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev,
list_for_each_entry(pg, &m->priority_groups, list) {
list_for_each_entry(pgpath, &pg->pgpaths, list) {
- if (pgpath->path.dev == dev)
+ if (pgpath->path.dev->bdev->bd_dev == dev)
r = action(pgpath);
}
}
@@ -1960,7 +1958,7 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg
char *result, unsigned int maxlen)
{
int r = -EINVAL;
- struct dm_dev *dev;
+ dev_t dev;
struct multipath *m = ti->private;
action_fn action;
unsigned long flags;
@@ -2009,7 +2007,7 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg
goto out;
}
- r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
+ r = dm_devt_from_path(argv[1], &dev);
if (r) {
DMWARN("message: error getting device %s",
argv[1]);
@@ -2018,8 +2016,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg
r = action_dev(m, dev, action);
- dm_put_device(ti, dev);
-
out:
mutex_unlock(&m->work_mutex);
return r;
diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c
index 461ee6b2044d..716807e511ee 100644
--- a/drivers/md/dm-ps-io-affinity.c
+++ b/drivers/md/dm-ps-io-affinity.c
@@ -116,7 +116,7 @@ static int ioa_create(struct path_selector *ps, unsigned int argc, char **argv)
if (!s)
return -ENOMEM;
- s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *),
+ s->path_map = kcalloc(nr_cpu_ids, sizeof(struct path_info *),
GFP_KERNEL);
if (!s->path_map)
goto free_selector;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index abe88d1e6735..6adc55fd90d3 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1626,6 +1626,23 @@ static int _check_data_dev_sectors(struct raid_set *rs)
return 0;
}
+/* Get reshape sectors from data_offsets or raid set */
+static sector_t _get_reshape_sectors(struct raid_set *rs)
+{
+ struct md_rdev *rdev;
+ sector_t reshape_sectors = 0;
+
+ rdev_for_each(rdev, &rs->md)
+ if (!test_bit(Journal, &rdev->flags)) {
+ reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ?
+ rdev->data_offset - rdev->new_data_offset :
+ rdev->new_data_offset - rdev->data_offset;
+ break;
+ }
+
+ return max(reshape_sectors, (sector_t) rs->data_offset);
+}
+
/* Calculate the sectors per device and per array used for @rs */
static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev)
{
@@ -1656,7 +1673,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, b
if (sector_div(dev_sectors, data_stripes))
goto bad;
- array_sectors = (data_stripes + delta_disks) * dev_sectors;
+ array_sectors = (data_stripes + delta_disks) * (dev_sectors - _get_reshape_sectors(rs));
if (sector_div(array_sectors, rs->raid10_copies))
goto bad;
@@ -1665,7 +1682,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, b
else
/* Striped layouts */
- array_sectors = (data_stripes + delta_disks) * dev_sectors;
+ array_sectors = (data_stripes + delta_disks) * (dev_sectors - _get_reshape_sectors(rs));
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
@@ -1704,11 +1721,20 @@ static void do_table_event(struct work_struct *ws)
struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
smp_rmb(); /* Make sure we access most actual mddev properties */
- if (!rs_is_reshaping(rs)) {
+
+ /* Only grow size resulting from added stripe(s) after reshape ended. */
+ if (!rs_is_reshaping(rs) &&
+ rs->array_sectors > rs->md.array_sectors &&
+ !rs->md.delta_disks &&
+ rs->md.raid_disks == rs->raid_disks) {
+ /* The raid10 personality doesn't provide proper device sizes -> correct. */
if (rs_is_raid10(rs))
rs_set_rdev_sectors(rs);
+
+ rs->md.array_sectors = rs->array_sectors;
rs_set_capacity(rs);
}
+
dm_table_event(rs->ti->table);
}
@@ -2493,7 +2519,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
rdev->saved_raid_disk = rdev->raid_disk;
}
- /* Reshape support -> restore repective data offsets */
+ /* Reshape support -> restore respective data offsets */
rdev->data_offset = le64_to_cpu(sb->data_offset);
rdev->new_data_offset = le64_to_cpu(sb->new_data_offset);
@@ -2811,23 +2837,6 @@ static int rs_prepare_reshape(struct raid_set *rs)
return 0;
}
-/* Get reshape sectors from data_offsets or raid set */
-static sector_t _get_reshape_sectors(struct raid_set *rs)
-{
- struct md_rdev *rdev;
- sector_t reshape_sectors = 0;
-
- rdev_for_each(rdev, &rs->md)
- if (!test_bit(Journal, &rdev->flags)) {
- reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ?
- rdev->data_offset - rdev->new_data_offset :
- rdev->new_data_offset - rdev->data_offset;
- break;
- }
-
- return max(reshape_sectors, (sector_t) rs->data_offset);
-}
-
/*
* Reshape:
* - change raid layout
@@ -3187,7 +3196,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (reshape_sectors || rs_is_raid1(rs)) {
/*
* We can only prepare for a reshape here, because the
- * raid set needs to run to provide the repective reshape
+ * raid set needs to run to provide the respective reshape
* check functions via its MD personality instance.
*
* So do the reshape check after md_run() succeeded.
@@ -3542,7 +3551,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
recovery = rs->md.recovery;
state = decipher_sync_action(mddev, recovery);
progress = rs_get_progress(rs, recovery, state, resync_max_sectors);
- resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
+ resync_mismatches = mddev->last_sync_action == ACTION_CHECK ?
atomic64_read(&mddev->resync_mismatches) : 0;
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
@@ -3802,8 +3811,8 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
struct raid_set *rs = ti->private;
unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
- blk_limits_io_min(limits, chunk_size_bytes);
- blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
+ limits->io_min = chunk_size_bytes;
+ limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs);
}
static void raid_presuspend(struct dm_target *ti)
@@ -3940,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
/* Try loading the bitmap unless "raid0", which does not have one */
if (!rs_is_raid0(rs) &&
!test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
- r = md_bitmap_load(&rs->md);
+ struct mddev *mddev = &rs->md;
+
+ r = mddev->bitmap_ops->load(mddev);
if (r)
DMERR("Failed to load bitmap");
}
@@ -4023,6 +4034,11 @@ static int raid_preresume(struct dm_target *ti)
if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
return 0;
+ /* If different and no explicit grow request, expose MD array size as of superblock. */
+ if (!test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) &&
+ rs->array_sectors != mddev->array_sectors)
+ rs_set_capacity(rs);
+
/*
* The superblocks need to be updated on disk if the
* array is new or new devices got added (thus zeroed
@@ -4052,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti)
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
- r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
+ r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
+ chunksize, false);
if (r)
DMERR("Failed to resize bitmap");
}
@@ -4101,10 +4118,11 @@ static void raid_resume(struct dm_target *ti)
if (mddev->delta_disks < 0)
rs_set_capacity(rs);
+ mddev_lock_nointr(mddev);
WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery));
- WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+ WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread,
+ lockdep_is_held(&mddev->reconfig_mutex)));
clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
- mddev_lock_nointr(mddev);
mddev->ro = 0;
mddev->in_sync = 0;
md_unfrozen_sync_thread(mddev);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9511dae5b556..8c6f1f7e6456 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -656,7 +656,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
unsigned int i;
struct dm_io_region io[MAX_NR_MIRRORS], *dest = io;
struct mirror *m;
- blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH);
+ blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH | REQ_ATOMIC);
struct dm_io_request io_req = {
.bi_opf = REQ_OP_WRITE | op_flags,
.mem.type = DM_IO_BIO,
@@ -1483,8 +1483,9 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 14, 0},
+ .version = {1, 15, 0},
.module = THIS_MODULE,
+ .features = DM_TARGET_ATOMIC_WRITES,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
.map = mirror_map,
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index f7e9a3632eb3..e23076f7ece2 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -496,8 +496,10 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
map = dm_get_live_table(md, &srcu_idx);
if (unlikely(!map)) {
+ DMERR_LIMIT("%s: mapping table unavailable, erroring io",
+ dm_device_name(md));
dm_put_live_table(md, srcu_idx);
- return BLK_STS_RESOURCE;
+ return BLK_STS_IOERR;
}
ti = dm_table_find_target(map, 0);
dm_put_live_table(md, srcu_idx);
@@ -545,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
- md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
+ md->tag_set->flags = BLK_MQ_F_STACKING;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 0ace06d1bee3..f40c18da4000 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2410,7 +2410,7 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
/* All discards are split on chunk_size boundary */
limits->discard_granularity = snap->store->chunk_size;
- limits->max_discard_sectors = snap->store->chunk_size;
+ limits->max_hw_discard_sectors = snap->store->chunk_size;
up_read(&_origins_lock);
}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 16b93ae51d96..3786ac67cefe 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -157,6 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_discard_bios = stripes;
ti->num_secure_erase_bios = stripes;
ti->num_write_zeroes_bios = stripes;
+ ti->flush_bypasses_map = true;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
@@ -458,14 +459,15 @@ static void stripe_io_hints(struct dm_target *ti,
struct stripe_c *sc = ti->private;
unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT;
- blk_limits_io_min(limits, chunk_size);
- blk_limits_io_opt(limits, chunk_size * sc->stripes);
+ limits->io_min = chunk_size;
+ limits->io_opt = chunk_size * sc->stripes;
}
static struct target_type stripe_target = {
.name = "striped",
- .version = {1, 6, 0},
- .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT,
+ .version = {1, 7, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
+ DM_TARGET_ATOMIC_WRITES,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..0ef5203387b2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **result, blk_mode_t mode,
t->type = DM_TYPE_NONE;
t->mode = mode;
t->md = md;
+ t->flush_bypasses_map = true;
*result = t;
return 0;
}
@@ -330,23 +331,15 @@ static int upgrade_mode(struct dm_dev_internal *dd, blk_mode_t new_mode,
}
/*
- * Add a device to the list, or just increment the usage count if
- * it's already present.
- *
* Note: the __ref annotation is because this function can call the __init
* marked early_lookup_bdev when called during early boot code from dm-init.c.
*/
-int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode,
- struct dm_dev **result)
+int __ref dm_devt_from_path(const char *path, dev_t *dev_p)
{
int r;
dev_t dev;
unsigned int major, minor;
char dummy;
- struct dm_dev_internal *dd;
- struct dm_table *t = ti->table;
-
- BUG_ON(!t);
if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
/* Extract the major/minor numbers */
@@ -362,6 +355,29 @@ int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode,
if (r)
return r;
}
+ *dev_p = dev;
+ return 0;
+}
+EXPORT_SYMBOL(dm_devt_from_path);
+
+/*
+ * Add a device to the list, or just increment the usage count if
+ * it's already present.
+ */
+int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode,
+ struct dm_dev **result)
+{
+ int r;
+ dev_t dev;
+ struct dm_dev_internal *dd;
+ struct dm_table *t = ti->table;
+
+ BUG_ON(!t);
+
+ r = dm_devt_from_path(path, &dev);
+ if (r)
+ return r;
+
if (dev == disk_devt(t->md->disk))
return -EINVAL;
@@ -425,6 +441,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
q->limits.logical_block_size,
q->limits.alignment_offset,
(unsigned long long) start << SECTOR_SHIFT);
+
+ /*
+ * Only stack the integrity profile if the target doesn't have native
+ * integrity support.
+ */
+ if (!dm_target_has_integrity(ti->type))
+ queue_limits_stack_integrity_bdev(limits, bdev);
return 0;
}
@@ -572,6 +595,12 @@ int dm_split_args(int *argc, char ***argvp, char *input)
return 0;
}
+static void dm_set_stacking_limits(struct queue_limits *limits)
+{
+ blk_set_stacking_limits(limits);
+ limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
+}
+
/*
* Impose necessary and sufficient conditions on a devices's table such
* that any incoming bio which respects its logical_block_size can be
@@ -610,7 +639,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *t,
for (i = 0; i < t->num_targets; i++) {
ti = dm_table_get_target(t, i);
- blk_set_stacking_limits(&ti_limits);
+ dm_set_stacking_limits(&ti_limits);
/* combine all target devices' limits */
if (ti->type->iterate_devices)
@@ -702,9 +731,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
t->immutable_target_type = ti->type;
}
- if (dm_target_has_integrity(ti->type))
- t->integrity_added = 1;
-
ti->table = t;
ti->begin = start;
ti->len = len;
@@ -738,6 +764,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
static_branch_enable(&swap_bios_enabled);
+ if (!ti->flush_bypasses_map)
+ t->flush_bypasses_map = false;
+
return 0;
bad:
@@ -1004,24 +1033,19 @@ struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
return NULL;
}
-bool dm_table_bio_based(struct dm_table *t)
-{
- return __table_type_bio_based(dm_table_get_type(t));
-}
-
bool dm_table_request_based(struct dm_table *t)
{
return __table_type_request_based(dm_table_get_type(t));
}
-static bool dm_table_supports_poll(struct dm_table *t);
-
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
enum dm_queue_mode type = dm_table_get_type(t);
unsigned int per_io_data_size = 0, front_pad, io_front_pad;
unsigned int min_pool_size = 0, pool_size;
struct dm_md_mempools *pools;
+ unsigned int bioset_flags = 0;
+ bool mempool_needs_integrity = t->integrity_supported;
if (unlikely(type == DM_TYPE_NONE)) {
DMERR("no table type is set, can't allocate mempools");
@@ -1038,11 +1062,16 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
goto init_bs;
}
+ if (md->queue->limits.features & BLK_FEAT_POLL)
+ bioset_flags |= BIOSET_PERCPU_CACHE;
+
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
per_io_data_size = max(per_io_data_size, ti->per_io_data_size);
min_pool_size = max(min_pool_size, ti->num_flush_bios);
+
+ mempool_needs_integrity |= ti->mempool_needs_integrity;
}
pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size,
@@ -1050,16 +1079,15 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
io_front_pad = roundup(per_io_data_size,
__alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
- if (bioset_init(&pools->io_bs, pool_size, io_front_pad,
- dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0))
+ if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags))
goto out_free_pools;
- if (t->integrity_supported &&
+ if (mempool_needs_integrity &&
bioset_integrity_create(&pools->io_bs, pool_size))
goto out_free_pools;
init_bs:
if (bioset_init(&pools->bs, pool_size, front_pad, 0))
goto out_free_pools;
- if (t->integrity_supported &&
+ if (mempool_needs_integrity &&
bioset_integrity_create(&pools->bs, pool_size))
goto out_free_pools;
@@ -1119,99 +1147,6 @@ static int dm_table_build_index(struct dm_table *t)
return r;
}
-static bool integrity_profile_exists(struct gendisk *disk)
-{
- return !!blk_get_integrity(disk);
-}
-
-/*
- * Get a disk whose integrity profile reflects the table's profile.
- * Returns NULL if integrity support was inconsistent or unavailable.
- */
-static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t)
-{
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_dev_internal *dd = NULL;
- struct gendisk *prev_disk = NULL, *template_disk = NULL;
-
- for (unsigned int i = 0; i < t->num_targets; i++) {
- struct dm_target *ti = dm_table_get_target(t, i);
-
- if (!dm_target_passes_integrity(ti->type))
- goto no_integrity;
- }
-
- list_for_each_entry(dd, devices, list) {
- template_disk = dd->dm_dev->bdev->bd_disk;
- if (!integrity_profile_exists(template_disk))
- goto no_integrity;
- else if (prev_disk &&
- blk_integrity_compare(prev_disk, template_disk) < 0)
- goto no_integrity;
- prev_disk = template_disk;
- }
-
- return template_disk;
-
-no_integrity:
- if (prev_disk)
- DMWARN("%s: integrity not set: %s and %s profile mismatch",
- dm_device_name(t->md),
- prev_disk->disk_name,
- template_disk->disk_name);
- return NULL;
-}
-
-/*
- * Register the mapped device for blk_integrity support if the
- * underlying devices have an integrity profile. But all devices may
- * not have matching profiles (checking all devices isn't reliable
- * during table load because this table may use other DM device(s) which
- * must be resumed before they will have an initialized integity
- * profile). Consequently, stacked DM devices force a 2 stage integrity
- * profile validation: First pass during table load, final pass during
- * resume.
- */
-static int dm_table_register_integrity(struct dm_table *t)
-{
- struct mapped_device *md = t->md;
- struct gendisk *template_disk = NULL;
-
- /* If target handles integrity itself do not register it here. */
- if (t->integrity_added)
- return 0;
-
- template_disk = dm_table_get_integrity_disk(t);
- if (!template_disk)
- return 0;
-
- if (!integrity_profile_exists(dm_disk(md))) {
- t->integrity_supported = true;
- /*
- * Register integrity profile during table load; we can do
- * this because the final profile must match during resume.
- */
- blk_integrity_register(dm_disk(md),
- blk_get_integrity(template_disk));
- return 0;
- }
-
- /*
- * If DM device already has an initialized integrity
- * profile the new profile should not conflict.
- */
- if (blk_integrity_compare(dm_disk(md), template_disk) < 0) {
- DMERR("%s: conflict with existing integrity profile: %s profile mismatch",
- dm_device_name(t->md),
- template_disk->disk_name);
- return 1;
- }
-
- /* Preserve existing integrity profile */
- t->integrity_supported = true;
- return 0;
-}
-
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct dm_crypto_profile {
@@ -1423,12 +1358,6 @@ int dm_table_complete(struct dm_table *t)
return r;
}
- r = dm_table_register_integrity(t);
- if (r) {
- DMERR("could not register integrity profile.");
- return r;
- }
-
r = dm_table_construct_crypto_profile(t);
if (r) {
DMERR("could not construct crypto profile.");
@@ -1493,14 +1422,6 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
return &t->targets[(KEYS_PER_NODE * n) + k];
}
-static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags);
-}
-
/*
* type->iterate_devices() should be called when the sanity check needs to
* iterate and check all underlying data devices. iterate_devices() will
@@ -1548,19 +1469,6 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
-static bool dm_table_supports_poll(struct dm_table *t)
-{
- for (unsigned int i = 0; i < t->num_targets; i++) {
- struct dm_target *ti = dm_table_get_target(t, i);
-
- if (!ti->type->iterate_devices ||
- ti->type->iterate_devices(ti, device_not_poll_capable, NULL))
- return false;
- }
-
- return true;
-}
-
/*
* Check whether a table has no data devices attached using each
* target's iterate_devices method.
@@ -1686,12 +1594,20 @@ int dm_calculate_queue_limits(struct dm_table *t,
unsigned int zone_sectors = 0;
bool zoned = false;
- blk_set_stacking_limits(limits);
+ dm_set_stacking_limits(limits);
+
+ t->integrity_supported = true;
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (!dm_target_passes_integrity(ti->type))
+ t->integrity_supported = false;
+ }
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
- blk_set_stacking_limits(&ti_limits);
+ dm_set_stacking_limits(&ti_limits);
if (!ti->type->iterate_devices) {
/* Set I/O hints portion of queue limits */
@@ -1706,12 +1622,12 @@ int dm_calculate_queue_limits(struct dm_table *t,
ti->type->iterate_devices(ti, dm_set_device_limits,
&ti_limits);
- if (!zoned && ti_limits.zoned) {
+ if (!zoned && (ti_limits.features & BLK_FEAT_ZONED)) {
/*
* After stacking all limits, validate all devices
* in table support this zoned model and zone sectors.
*/
- zoned = ti_limits.zoned;
+ zoned = (ti_limits.features & BLK_FEAT_ZONED);
zone_sectors = ti_limits.chunk_sectors;
}
@@ -1738,6 +1654,18 @@ combine_limits:
dm_device_name(t->md),
(unsigned long long) ti->begin,
(unsigned long long) ti->len);
+
+ if (t->integrity_supported ||
+ dm_target_has_integrity(ti->type)) {
+ if (!queue_limits_stack_integrity(limits, &ti_limits)) {
+ DMWARN("%s: adding target device (start sect %llu len %llu) "
+ "disabled integrity support due to incompatibility",
+ dm_device_name(t->md),
+ (unsigned long long) ti->begin,
+ (unsigned long long) ti->len);
+ t->integrity_supported = false;
+ }
+ }
}
/*
@@ -1747,12 +1675,12 @@ combine_limits:
* zoned model on host-managed zoned block devices.
* BUT...
*/
- if (limits->zoned) {
+ if (limits->features & BLK_FEAT_ZONED) {
/*
* ...IF the above limits stacking determined a zoned model
* validate that all of the table's devices conform to it.
*/
- zoned = limits->zoned;
+ zoned = limits->features & BLK_FEAT_ZONED;
zone_sectors = limits->chunk_sectors;
}
if (validate_hardware_zoned(t, zoned, zone_sectors))
@@ -1762,63 +1690,15 @@ combine_limits:
}
/*
- * Verify that all devices have an integrity profile that matches the
- * DM device's registered integrity profile. If the profiles don't
- * match then unregister the DM device's integrity profile.
+ * Check if a target requires flush support even if none of the underlying
+ * devices need it (e.g. to persist target-specific metadata).
*/
-static void dm_table_verify_integrity(struct dm_table *t)
-{
- struct gendisk *template_disk = NULL;
-
- if (t->integrity_added)
- return;
-
- if (t->integrity_supported) {
- /*
- * Verify that the original integrity profile
- * matches all the devices in this table.
- */
- template_disk = dm_table_get_integrity_disk(t);
- if (template_disk &&
- blk_integrity_compare(dm_disk(t->md), template_disk) >= 0)
- return;
- }
-
- if (integrity_profile_exists(dm_disk(t->md))) {
- DMWARN("%s: unable to establish an integrity profile",
- dm_device_name(t->md));
- blk_integrity_unregister(dm_disk(t->md));
- }
-}
-
-static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- unsigned long flush = (unsigned long) data;
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return (q->queue_flags & flush);
-}
-
-static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
+static bool dm_table_supports_flush(struct dm_table *t)
{
- /*
- * Require at least one underlying device to support flushes.
- * t->devices includes internal dm devices such as mirror logs
- * so we need to use iterate_devices here, which targets
- * supporting flushes must provide.
- */
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
- if (!ti->num_flush_bios)
- continue;
-
- if (ti->flush_supported)
- return true;
-
- if (ti->type->iterate_devices &&
- ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
+ if (ti->num_flush_bios && ti->flush_supported)
return true;
}
@@ -1839,20 +1719,6 @@ static int device_dax_write_cache_enabled(struct dm_target *ti,
return false;
}
-static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- return !bdev_nonrot(dev->bdev);
-}
-
-static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return !blk_queue_add_random(q);
-}
-
static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1877,12 +1743,6 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
return true;
}
-static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- return !bdev_nowait(dev->bdev);
-}
-
static bool dm_table_supports_nowait(struct dm_table *t)
{
for (unsigned int i = 0; i < t->num_targets; i++) {
@@ -1890,10 +1750,6 @@ static bool dm_table_supports_nowait(struct dm_table *t)
if (!dm_target_supports_nowait(ti->type))
return false;
-
- if (!ti->type->iterate_devices ||
- ti->type->iterate_devices(ti, device_not_nowait_capable, NULL))
- return false;
}
return true;
@@ -1950,119 +1806,99 @@ static bool dm_table_supports_secure_erase(struct dm_table *t)
return true;
}
-static int device_requires_stable_pages(struct dm_target *ti,
- struct dm_dev *dev, sector_t start,
- sector_t len, void *data)
+static int device_not_atomic_write_capable(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ return !bdev_can_atomic_write(dev->bdev);
+}
+
+static bool dm_table_supports_atomic_writes(struct dm_table *t)
{
- return bdev_stable_writes(dev->bdev);
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (!dm_target_supports_atomic_writes(ti->type))
+ return false;
+
+ if (!ti->type->iterate_devices)
+ return false;
+
+ if (ti->type->iterate_devices(ti,
+ device_not_atomic_write_capable, NULL)) {
+ return false;
+ }
+ }
+ return true;
}
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
- bool wc = false, fua = false;
int r;
+ if (!dm_table_supports_nowait(t))
+ limits->features &= ~BLK_FEAT_NOWAIT;
+
/*
- * Copy table's limits to the DM device's request_queue
+ * The current polling implementation does not support request based
+ * stacking.
*/
- q->limits = *limits;
-
- if (dm_table_supports_nowait(t))
- blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
+ if (!__table_type_bio_based(t->type))
+ limits->features &= ~BLK_FEAT_POLL;
if (!dm_table_supports_discards(t)) {
- q->limits.max_discard_sectors = 0;
- q->limits.max_hw_discard_sectors = 0;
- q->limits.discard_granularity = 0;
- q->limits.discard_alignment = 0;
- q->limits.discard_misaligned = 0;
+ limits->max_hw_discard_sectors = 0;
+ limits->discard_granularity = 0;
+ limits->discard_alignment = 0;
}
+ if (!dm_table_supports_write_zeroes(t))
+ limits->max_write_zeroes_sectors = 0;
+
if (!dm_table_supports_secure_erase(t))
- q->limits.max_secure_erase_sectors = 0;
+ limits->max_secure_erase_sectors = 0;
- if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
- wc = true;
- if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
- fua = true;
- }
- blk_queue_write_cache(q, wc, fua);
+ if (dm_table_supports_flush(t))
+ limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
if (dm_table_supports_dax(t, device_not_dax_capable)) {
- blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+ limits->features |= BLK_FEAT_DAX;
if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
set_dax_synchronous(t->md->dax_dev);
} else
- blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
+ limits->features &= ~BLK_FEAT_DAX;
if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
dax_write_cache(t->md->dax_dev, true);
- /* Ensure that all underlying devices are non-rotational. */
- if (dm_table_any_dev_attr(t, device_is_rotational, NULL))
- blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
- else
- blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
-
- if (!dm_table_supports_write_zeroes(t))
- q->limits.max_write_zeroes_sectors = 0;
+ /* For a zoned table, set up the zone-related queue attributes. */
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ (limits->features & BLK_FEAT_ZONED)) {
+ r = dm_set_zones_restrictions(t, q, limits);
+ if (r)
+ return r;
+ }
- dm_table_verify_integrity(t);
+ if (dm_table_supports_atomic_writes(t))
+ limits->features |= BLK_FEAT_ATOMIC_WRITES;
- /*
- * Some devices don't use blk_integrity but still want stable pages
- * because they do their own checksumming.
- * If any underlying device requires stable pages, a table must require
- * them as well. Only targets that support iterate_devices are considered:
- * don't want error, zero, etc to require stable pages.
- */
- if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL))
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
-
- /*
- * Determine whether or not this queue's I/O timings contribute
- * to the entropy pool, Only request-based targets use this.
- * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
- * have it set.
- */
- if (blk_queue_add_random(q) &&
- dm_table_any_dev_attr(t, device_is_not_random, NULL))
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
+ r = queue_limits_set(q, limits);
+ if (r)
+ return r;
/*
- * For a zoned target, setup the zones related queue attributes
- * and resources necessary for zone append emulation if necessary.
+ * Now that the limits are set, check the zones mapped by the table
+ * and set up the resources for zone append emulation if necessary.
*/
- if (blk_queue_is_zoned(q)) {
- r = dm_set_zones_restrictions(t, q);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ (limits->features & BLK_FEAT_ZONED)) {
+ r = dm_revalidate_zones(t, q);
if (r)
return r;
- if (!static_key_enabled(&zoned_enabled.key))
- static_branch_enable(&zoned_enabled);
}
dm_update_crypto_profile(q, t);
- disk_update_readahead(t->md->disk);
-
- /*
- * Check for request-based device is left to
- * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
- *
- * For bio-based device, only set QUEUE_FLAG_POLL when all
- * underlying devices supporting polling.
- */
- if (__table_type_bio_based(t->type)) {
- if (dm_table_supports_poll(t))
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
- }
-
return 0;
}
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 0c4efb0bef8a..652627aea11b 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -249,7 +249,6 @@ static int io_err_iterate_devices(struct dm_target *ti,
static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
- limits->max_discard_sectors = UINT_MAX;
limits->max_hw_discard_sectors = UINT_MAX;
limits->discard_granularity = 512;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 6022189c1388..f90679cfec5b 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -249,7 +249,7 @@ struct dm_thin_device {
*/
#define SUPERBLOCK_CSUM_XOR 160774
-static void sb_prepare_for_write(struct dm_block_validator *v,
+static void sb_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -261,7 +261,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
SUPERBLOCK_CSUM_XOR));
}
-static int sb_check(struct dm_block_validator *v,
+static int sb_check(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -294,7 +294,7 @@ static int sb_check(struct dm_block_validator *v,
return 0;
}
-static struct dm_block_validator sb_validator = {
+static const struct dm_block_validator sb_validator = {
.name = "superblock",
.prepare_for_write = sb_prepare_for_write,
.check = sb_check
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 4793ad2aa1f7..05cf4e3f2bbe 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -592,12 +592,6 @@ struct dm_thin_endio_hook {
struct dm_bio_prison_cell *cell;
};
-static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
-{
- bio_list_merge(bios, master);
- bio_list_init(master);
-}
-
static void error_bio_list(struct bio_list *bios, blk_status_t error)
{
struct bio *bio;
@@ -616,7 +610,7 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
bio_list_init(&bios);
spin_lock_irq(&tc->lock);
- __merge_bio_list(&bios, master);
+ bio_list_merge_init(&bios, master);
spin_unlock_irq(&tc->lock);
error_bio_list(&bios, error);
@@ -645,8 +639,8 @@ static void requeue_io(struct thin_c *tc)
bio_list_init(&bios);
spin_lock_irq(&tc->lock);
- __merge_bio_list(&bios, &tc->deferred_bio_list);
- __merge_bio_list(&bios, &tc->retry_on_resume_list);
+ bio_list_merge_init(&bios, &tc->deferred_bio_list);
+ bio_list_merge_init(&bios, &tc->retry_on_resume_list);
spin_unlock_irq(&tc->lock);
error_bio_list(&bios, BLK_STS_DM_REQUEUE);
@@ -2338,10 +2332,9 @@ static struct thin_c *get_first_thin(struct pool *pool)
struct thin_c *tc = NULL;
rcu_read_lock();
- if (!list_empty(&pool->active_thins)) {
- tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+ tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list);
+ if (tc)
thin_get(tc);
- }
rcu_read_unlock();
return tc;
@@ -2490,6 +2483,7 @@ static void pool_work_wait(struct pool_work *pw, struct pool *pool,
init_completion(&pw->complete);
queue_work(pool->wq, &pw->worker);
wait_for_completion(&pw->complete);
+ destroy_work_on_stack(&pw->worker);
}
/*----------------------------------------------------------------*/
@@ -2848,7 +2842,7 @@ static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
{
struct pool *pool = pt->pool;
struct block_device *data_bdev = pt->data_dev->bdev;
- struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
+ struct queue_limits *data_limits = bdev_limits(data_bdev);
const char *reason = NULL;
if (!pt->adjusted_pf.discard_passdown)
@@ -2954,7 +2948,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
if (IS_ERR(pmd)) {
*error = "Error creating metadata object";
- return (struct pool *)pmd;
+ return ERR_CAST(pmd);
}
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
@@ -4085,10 +4079,10 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
if (io_opt_sectors < pool->sectors_per_block ||
!is_factor(io_opt_sectors, pool->sectors_per_block)) {
if (is_factor(pool->sectors_per_block, limits->max_sectors))
- blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
+ limits->io_min = limits->max_sectors << SECTOR_SHIFT;
else
- blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
- blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ limits->io_min = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->io_opt = pool->sectors_per_block << SECTOR_SHIFT;
}
/*
@@ -4100,7 +4094,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
if (pt->adjusted_pf.discard_enabled) {
disable_discard_passdown_if_not_supported(pt);
if (!pt->adjusted_pf.discard_passdown)
- limits->max_discard_sectors = 0;
+ limits->max_hw_discard_sectors = 0;
/*
* The pool uses the same discard limits as the underlying data
* device. DM core has already set this up.
@@ -4497,7 +4491,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
if (pool->pf.discard_enabled) {
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
- limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
+ limits->max_hw_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
}
}
diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c
index 48587c16c445..e8a9432057dc 100644
--- a/drivers/md/dm-unstripe.c
+++ b/drivers/md/dm-unstripe.c
@@ -85,8 +85,8 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
uc->physical_start = start;
- uc->unstripe_offset = uc->unstripe * uc->chunk_size;
- uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size;
+ uc->unstripe_offset = (sector_t)uc->unstripe * uc->chunk_size;
+ uc->unstripe_width = (sector_t)(uc->stripes - 1) * uc->chunk_size;
uc->chunk_shift = is_power_of_2(uc->chunk_size) ? fls(uc->chunk_size) - 1 : 0;
tmp_len = ti->len;
diff --git a/drivers/md/dm-vdo/Kconfig b/drivers/md/dm-vdo/Kconfig
index 111ecd2c2a24..2400b2bc4bc7 100644
--- a/drivers/md/dm-vdo/Kconfig
+++ b/drivers/md/dm-vdo/Kconfig
@@ -7,6 +7,7 @@ config DM_VDO
select DM_BUFIO
select LZ4_COMPRESS
select LZ4_DECOMPRESS
+ select MIN_HEAP
help
This device mapper target presents a block device with
deduplication, compression and thin-provisioning.
diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile
index 33e09abc6acd..9476957bfbf4 100644
--- a/drivers/md/dm-vdo/Makefile
+++ b/drivers/md/dm-vdo/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-ccflags-y := -I$(srctree)/$(src) -I$(srctree)/$(src)/indexer
+ccflags-y := -I$(src) -I$(src)/indexer
obj-$(CONFIG_DM_VDO) += dm-vdo.o
diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c
index a0a7c1bd634e..89cb7942ec5c 100644
--- a/drivers/md/dm-vdo/block-map.c
+++ b/drivers/md/dm-vdo/block-map.c
@@ -209,8 +209,6 @@ static int initialize_info(struct vdo_page_cache *cache)
/**
* allocate_cache_components() - Allocate components of the cache which require their own
* allocation.
- * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be
- * written out.
*
* The caller is responsible for all clean up on errors.
*
diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c
index 94f6f1ccfb7d..810002747091 100644
--- a/drivers/md/dm-vdo/data-vio.c
+++ b/drivers/md/dm-vdo/data-vio.c
@@ -327,8 +327,9 @@ static u32 __must_check pack_status(struct data_vio_compression_status status)
/**
* set_data_vio_compression_status() - Set the compression status of a data_vio.
- * @state: The expected current status of the data_vio.
- * @new_state: The status to set.
+ * @data_vio: The data_vio to change.
+ * @status: The expected current status of the data_vio.
+ * @new_status: The status to set.
*
* Return: true if the new status was set, false if the data_vio's compression status did not
* match the expected state, and so was left unchanged.
@@ -501,6 +502,7 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb
memset(&data_vio->record_name, 0, sizeof(data_vio->record_name));
memset(&data_vio->duplicate, 0, sizeof(data_vio->duplicate));
+ vdo_reset_completion(&data_vio->decrement_completion);
vdo_reset_completion(completion);
completion->error_handler = handle_data_vio_error;
set_data_vio_logical_callback(data_vio, attempt_logical_block_lock);
@@ -604,8 +606,7 @@ static void assign_discard_permit(struct limiter *limiter)
static void get_waiters(struct limiter *limiter)
{
- bio_list_merge(&limiter->waiters, &limiter->new_waiters);
- bio_list_init(&limiter->new_waiters);
+ bio_list_merge_init(&limiter->waiters, &limiter->new_waiters);
}
static inline struct data_vio *get_available_data_vio(struct data_vio_pool *pool)
@@ -836,7 +837,7 @@ static void destroy_data_vio(struct data_vio *data_vio)
* @vdo: The vdo to which the pool will belong.
* @pool_size: The number of data_vios in the pool.
* @discard_limit: The maximum number of data_vios which may be used for discards.
- * @pool: A pointer to hold the newly allocated pool.
+ * @pool_ptr: A pointer to hold the newly allocated pool.
*/
int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size,
data_vio_count_t discard_limit, struct data_vio_pool **pool_ptr)
@@ -1074,35 +1075,6 @@ void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios)
spin_unlock(&pool->lock);
}
-data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool)
-{
- return READ_ONCE(pool->discard_limiter.busy);
-}
-
-data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool)
-{
- return READ_ONCE(pool->discard_limiter.limit);
-}
-
-data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool)
-{
- return READ_ONCE(pool->discard_limiter.max_busy);
-}
-
-int set_data_vio_pool_discard_limit(struct data_vio_pool *pool, data_vio_count_t limit)
-{
- if (get_data_vio_pool_request_limit(pool) < limit) {
- // The discard limit may not be higher than the data_vio limit.
- return -EINVAL;
- }
-
- spin_lock(&pool->lock);
- pool->discard_limiter.limit = limit;
- spin_unlock(&pool->lock);
-
- return VDO_SUCCESS;
-}
-
data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool)
{
return READ_ONCE(pool->limiter.busy);
@@ -1274,12 +1246,14 @@ static void clean_hash_lock(struct vdo_completion *completion)
static void finish_cleanup(struct data_vio *data_vio)
{
struct vdo_completion *completion = &data_vio->vio.completion;
+ u32 discard_size = min_t(u32, data_vio->remaining_discard,
+ VDO_BLOCK_SIZE - data_vio->offset);
VDO_ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL,
"complete data_vio has no allocation lock");
VDO_ASSERT_LOG_ONLY(data_vio->hash_lock == NULL,
"complete data_vio has no hash lock");
- if ((data_vio->remaining_discard <= VDO_BLOCK_SIZE) ||
+ if ((data_vio->remaining_discard <= discard_size) ||
(completion->result != VDO_SUCCESS)) {
struct data_vio_pool *pool = completion->vdo->data_vio_pool;
@@ -1288,12 +1262,12 @@ static void finish_cleanup(struct data_vio *data_vio)
return;
}
- data_vio->remaining_discard -= min_t(u32, data_vio->remaining_discard,
- VDO_BLOCK_SIZE - data_vio->offset);
+ data_vio->remaining_discard -= discard_size;
data_vio->is_partial = (data_vio->remaining_discard < VDO_BLOCK_SIZE);
data_vio->read = data_vio->is_partial;
data_vio->offset = 0;
completion->requeue = true;
+ data_vio->first_reference_operation_complete = false;
launch_data_vio(data_vio, data_vio->logical.lbn + 1);
}
@@ -1966,7 +1940,8 @@ static void allocate_block(struct vdo_completion *completion)
.state = VDO_MAPPING_STATE_UNCOMPRESSED,
};
- if (data_vio->fua) {
+ if (data_vio->fua ||
+ data_vio->remaining_discard > (u32) (VDO_BLOCK_SIZE - data_vio->offset)) {
prepare_for_dedupe(data_vio);
return;
}
@@ -2043,7 +2018,6 @@ void continue_data_vio_with_block_map_slot(struct vdo_completion *completion)
return;
}
-
/*
* We don't need to write any data, so skip allocation and just update the block map and
* reference counts (via the journal).
@@ -2052,7 +2026,7 @@ void continue_data_vio_with_block_map_slot(struct vdo_completion *completion)
if (data_vio->is_zero)
data_vio->new_mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED;
- if (data_vio->remaining_discard > VDO_BLOCK_SIZE) {
+ if (data_vio->remaining_discard > (u32) (VDO_BLOCK_SIZE - data_vio->offset)) {
/* This is not the final block of a discard so we can't acknowledge it yet. */
update_metadata_for_data_vio_write(data_vio, NULL);
return;
diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h
index 25926b6cd98b..067b983bb291 100644
--- a/drivers/md/dm-vdo/data-vio.h
+++ b/drivers/md/dm-vdo/data-vio.h
@@ -336,11 +336,6 @@ void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *comp
void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion);
void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios);
-data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool);
-data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool);
-data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool);
-int __must_check set_data_vio_pool_discard_limit(struct data_vio_pool *pool,
- data_vio_count_t limit);
data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool);
data_vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool);
data_vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool);
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
index 117266e1b3ae..b6f8e2dc7729 100644
--- a/drivers/md/dm-vdo/dedupe.c
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -148,11 +148,6 @@
#include "vdo.h"
#include "wait-queue.h"
-struct uds_attribute {
- struct attribute attr;
- const char *(*show_string)(struct hash_zones *hash_zones);
-};
-
#define DEDUPE_QUERY_TIMER_IDLE 0
#define DEDUPE_QUERY_TIMER_RUNNING 1
#define DEDUPE_QUERY_TIMER_FIRED 2
@@ -570,7 +565,7 @@ static void wait_on_hash_lock(struct hash_lock *lock, struct data_vio *data_vio)
* @waiter: The data_vio's waiter link.
* @context: Not used.
*/
-static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused)
+static void abort_waiter(struct vdo_waiter *waiter, void __always_unused *context)
{
write_data_vio(vdo_waiter_as_data_vio(waiter));
}
@@ -734,6 +729,7 @@ static void process_update_result(struct data_vio *agent)
!change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE))
return;
+ agent->dedupe_context = NULL;
release_context(context);
}
@@ -1653,6 +1649,7 @@ static void process_query_result(struct data_vio *agent)
if (change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE)) {
agent->is_duplicate = decode_uds_advice(context);
+ agent->dedupe_context = NULL;
release_context(context);
}
}
@@ -1730,7 +1727,7 @@ static void report_bogus_lock_state(struct hash_lock *lock, struct data_vio *dat
/**
* vdo_continue_hash_lock() - Continue the processing state after writing, compressing, or
* deduplicating.
- * @data_vio: The data_vio to continue processing in its hash lock.
+ * @completion: The data_vio completion to continue processing in its hash lock.
*
* Asynchronously continue processing a data_vio in its hash lock after it has finished writing,
* compressing, or deduplicating, so it can share the result with any data_vios waiting in the hash
@@ -1828,7 +1825,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio
/**
* vdo_acquire_hash_lock() - Acquire or share a lock on a record name.
- * @data_vio: The data_vio acquiring a lock on its record name.
+ * @completion: The data_vio completion acquiring a lock on its record name.
*
* Acquire or share a lock on the hash (record name) of the data in a data_vio, updating the
* data_vio to reference the lock. This must only be called in the correct thread for the zone. In
@@ -2326,6 +2323,7 @@ static void timeout_index_operations_callback(struct vdo_completion *completion)
* send its requestor on its way.
*/
list_del_init(&context->list_entry);
+ context->requestor->dedupe_context = NULL;
continue_data_vio(context->requestor);
timed_out++;
}
@@ -2681,7 +2679,8 @@ static void get_index_statistics(struct hash_zones *zones,
/**
* vdo_get_dedupe_statistics() - Tally the statistics from all the hash zones and the UDS index.
- * @hash_zones: The hash zones to query
+ * @zones: The hash zones to query
+ * @stats: A structure to store the statistics
*
* Return: The sum of the hash lock statistics from all hash zones plus the statistics from the UDS
* index
diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c
index 5a4b0a927f56..0e04c2021682 100644
--- a/drivers/md/dm-vdo/dm-vdo-target.c
+++ b/drivers/md/dm-vdo/dm-vdo-target.c
@@ -878,7 +878,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti,
}
if (config->version == 0) {
- u64 device_size = i_size_read(config->owned_device->bdev->bd_inode);
+ u64 device_size = bdev_nr_bytes(config->owned_device->bdev);
config->physical_blocks = device_size / VDO_BLOCK_SIZE;
}
@@ -928,9 +928,9 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits)
limits->physical_block_size = VDO_BLOCK_SIZE;
/* The minimum io size for random io */
- blk_limits_io_min(limits, VDO_BLOCK_SIZE);
+ limits->io_min = VDO_BLOCK_SIZE;
/* The optimal io size for streamed/sequential io */
- blk_limits_io_opt(limits, VDO_BLOCK_SIZE);
+ limits->io_opt = VDO_BLOCK_SIZE;
/*
* Sets the maximum discard size that will be passed into VDO. This value comes from a
@@ -945,7 +945,7 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits)
* The value is used by dm-thin to determine whether to pass down discards. The block layer
* splits large discards on this boundary when this is set.
*/
- limits->max_discard_sectors =
+ limits->max_hw_discard_sectors =
(vdo->device_config->max_discard_blocks * VDO_SECTORS_PER_BLOCK);
/*
@@ -1011,7 +1011,7 @@ static void vdo_status(struct dm_target *ti, status_type_t status_type,
static block_count_t __must_check get_underlying_device_block_count(const struct vdo *vdo)
{
- return i_size_read(vdo_get_backing_device(vdo)->bd_inode) / VDO_BLOCK_SIZE;
+ return bdev_nr_bytes(vdo_get_backing_device(vdo)) / VDO_BLOCK_SIZE;
}
static int __must_check process_vdo_message_locked(struct vdo *vdo, unsigned int argc,
@@ -1105,6 +1105,9 @@ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv,
if ((argc == 1) && (strcasecmp(argv[0], "stats") == 0)) {
vdo_write_stats(vdo, result_buffer, maxlen);
result = 1;
+ } else if ((argc == 1) && (strcasecmp(argv[0], "config") == 0)) {
+ vdo_write_config(vdo, &result_buffer, &maxlen);
+ result = 1;
} else {
result = vdo_status_to_errno(process_vdo_message(vdo, argc, argv));
}
@@ -2293,6 +2296,14 @@ static void handle_load_error(struct vdo_completion *completion)
return;
}
+ if ((completion->result == VDO_UNSUPPORTED_VERSION) &&
+ (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) {
+ vdo_log_error("Aborting load due to unsupported version");
+ vdo->admin.phase = LOAD_PHASE_FINISHED;
+ load_callback(completion);
+ return;
+ }
+
vdo_log_error_strerror(completion->result,
"Entering read-only mode due to load error");
vdo->admin.phase = LOAD_PHASE_WAIT_FOR_READ_ONLY;
@@ -2737,6 +2748,19 @@ static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo)
vdo_log_info("starting device '%s'", device_name);
result = perform_admin_operation(vdo, LOAD_PHASE_START, load_callback,
handle_load_error, "load");
+ if (result == VDO_UNSUPPORTED_VERSION) {
+ /*
+ * A component version is not supported. This can happen when the
+ * recovery journal metadata is in an old version format. Abort the
+ * load without saving the state.
+ */
+ vdo->suspend_type = VDO_ADMIN_STATE_SUSPENDING;
+ perform_admin_operation(vdo, SUSPEND_PHASE_START,
+ suspend_callback, suspend_callback,
+ "suspend");
+ return result;
+ }
+
if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
/*
* Something has gone very wrong. Make sure everything has drained and
@@ -2808,7 +2832,8 @@ static int vdo_preresume(struct dm_target *ti)
vdo_register_thread_device_id(&instance_thread, &vdo->instance);
result = vdo_preresume_registered(ti, vdo);
- if ((result == VDO_PARAMETER_MISMATCH) || (result == VDO_INVALID_ADMIN_STATE))
+ if ((result == VDO_PARAMETER_MISMATCH) || (result == VDO_INVALID_ADMIN_STATE) ||
+ (result == VDO_UNSUPPORTED_VERSION))
result = -EINVAL;
vdo_unregister_thread_device_id();
return vdo_status_to_errno(result);
@@ -2832,7 +2857,7 @@ static void vdo_resume(struct dm_target *ti)
static struct target_type vdo_target_bio = {
.features = DM_TARGET_SINGLETON,
.name = "vdo",
- .version = { 9, 0, 0 },
+ .version = { 9, 1, 0 },
.module = THIS_MODULE,
.ctr = vdo_ctr,
.dtr = vdo_dtr,
diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c
index a34ea0229d53..100e92f8f866 100644
--- a/drivers/md/dm-vdo/encodings.c
+++ b/drivers/md/dm-vdo/encodings.c
@@ -858,7 +858,7 @@ static int __must_check make_partition(struct layout *layout, enum partition_id
/**
* vdo_initialize_layout() - Lay out the partitions of a vdo.
* @size: The entire size of the vdo.
- * @origin: The start of the layout on the underlying storage in blocks.
+ * @offset: The start of the layout on the underlying storage in blocks.
* @block_map_blocks: The size of the block map partition.
* @journal_blocks: The size of the journal partition.
* @summary_blocks: The size of the slab summary partition.
diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c
index 57e87f0d7069..dd4fdee2ca0c 100644
--- a/drivers/md/dm-vdo/flush.c
+++ b/drivers/md/dm-vdo/flush.c
@@ -369,8 +369,7 @@ void vdo_dump_flusher(const struct flusher *flusher)
static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo)
{
bio_list_init(&flush->bios);
- bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios);
- bio_list_init(&vdo->flusher->waiting_flush_bios);
+ bio_list_merge_init(&flush->bios, &vdo->flusher->waiting_flush_bios);
}
static void launch_flush(struct vdo_flush *flush)
diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c
index 7e32a25d3f2f..fb1db41c794b 100644
--- a/drivers/md/dm-vdo/indexer/chapter-index.c
+++ b/drivers/md/dm-vdo/indexer/chapter-index.c
@@ -177,7 +177,7 @@ int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
if (list_number < 0)
return UDS_OVERFLOW;
- next_list = first_list + list_number--,
+ next_list = first_list + list_number--;
result = uds_start_delta_index_search(delta_index, next_list, 0,
&entry);
if (result != UDS_SUCCESS)
diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c
index 627adc24af3b..af8fab83b0f3 100644
--- a/drivers/md/dm-vdo/indexer/index-layout.c
+++ b/drivers/md/dm-vdo/indexer/index-layout.c
@@ -248,32 +248,6 @@ static int __must_check compute_sizes(const struct uds_configuration *config,
return UDS_SUCCESS;
}
-int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size)
-{
- int result;
- struct uds_configuration *index_config;
- struct save_layout_sizes sizes;
-
- if (index_size == NULL) {
- vdo_log_error("Missing output size pointer");
- return -EINVAL;
- }
-
- result = uds_make_configuration(parameters, &index_config);
- if (result != UDS_SUCCESS) {
- vdo_log_error_strerror(result, "cannot compute index size");
- return uds_status_to_errno(result);
- }
-
- result = compute_sizes(index_config, &sizes);
- uds_free_configuration(index_config);
- if (result != UDS_SUCCESS)
- return uds_status_to_errno(result);
-
- *index_size = sizes.total_size;
- return UDS_SUCCESS;
-}
-
/* Create unique data using the current time and a pseudorandom number. */
static void create_unique_nonce_data(u8 *buffer)
{
diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c
index 1ba767144426..df4934846244 100644
--- a/drivers/md/dm-vdo/indexer/index.c
+++ b/drivers/md/dm-vdo/indexer/index.c
@@ -197,15 +197,12 @@ static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_
static int swap_open_chapter(struct index_zone *zone)
{
int result;
- struct open_chapter_zone *temporary_chapter;
result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter);
if (result != UDS_SUCCESS)
return result;
- temporary_chapter = zone->open_chapter;
- zone->open_chapter = zone->writing_chapter;
- zone->writing_chapter = temporary_chapter;
+ swap(zone->open_chapter, zone->writing_chapter);
return UDS_SUCCESS;
}
diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h
index 3744aaf625b0..183a94eb7e92 100644
--- a/drivers/md/dm-vdo/indexer/indexer.h
+++ b/drivers/md/dm-vdo/indexer/indexer.h
@@ -283,10 +283,6 @@ struct uds_request {
enum uds_index_region location;
};
-/* Compute the number of bytes needed to store an index. */
-int __must_check uds_compute_index_size(const struct uds_parameters *parameters,
- u64 *index_size);
-
/* A session is required for most index operations. */
int __must_check uds_create_index_session(struct uds_index_session **session);
diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c
index 515765d35794..1bee9d63dc0a 100644
--- a/drivers/md/dm-vdo/indexer/io-factory.c
+++ b/drivers/md/dm-vdo/indexer/io-factory.c
@@ -90,7 +90,7 @@ void uds_put_io_factory(struct io_factory *factory)
size_t uds_get_writable_size(struct io_factory *factory)
{
- return i_size_read(factory->bdev->bd_inode);
+ return bdev_nr_bytes(factory->bdev);
}
/* Create a struct dm_bufio_client for an index region starting at offset. */
diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c
index 3aa438f84ea1..aeb690415dbd 100644
--- a/drivers/md/dm-vdo/int-map.c
+++ b/drivers/md/dm-vdo/int-map.c
@@ -70,7 +70,7 @@
* it's crucial to keep the hop fields near the buckets that they use them so they'll tend to share
* cache lines.
*/
-struct __packed bucket {
+struct bucket {
/**
* @first_hop: The biased offset of the first entry in the hop list of the neighborhood
* that hashes to this bucket.
@@ -82,7 +82,7 @@ struct __packed bucket {
u64 key;
/** @value: The value stored in this bucket (NULL if empty). */
void *value;
-};
+} __packed;
/**
* struct int_map - The concrete definition of the opaque int_map type.
@@ -96,7 +96,7 @@ struct int_map {
size_t size;
/** @capacity: The number of neighborhoods in the map. */
size_t capacity;
- /* @bucket_count: The number of buckets in the bucket array. */
+ /** @bucket_count: The number of buckets in the bucket array. */
size_t bucket_count;
/** @buckets: The array of hash buckets. */
struct bucket *buckets;
@@ -310,7 +310,6 @@ static struct bucket *select_bucket(const struct int_map *map, u64 key)
/**
* search_hop_list() - Search the hop list associated with given hash bucket for a given search
* key.
- * @map: The map being searched.
* @bucket: The map bucket to search for the key.
* @key: The mapping key.
* @previous_ptr: Output. if not NULL, a pointer in which to store the bucket in the list preceding
@@ -321,9 +320,7 @@ static struct bucket *select_bucket(const struct int_map *map, u64 key)
*
* Return: An entry that matches the key, or NULL if not found.
*/
-static struct bucket *search_hop_list(struct int_map *map __always_unused,
- struct bucket *bucket,
- u64 key,
+static struct bucket *search_hop_list(struct bucket *bucket, u64 key,
struct bucket **previous_ptr)
{
struct bucket *previous = NULL;
@@ -357,7 +354,7 @@ static struct bucket *search_hop_list(struct int_map *map __always_unused,
*/
void *vdo_int_map_get(struct int_map *map, u64 key)
{
- struct bucket *match = search_hop_list(map, select_bucket(map, key), key, NULL);
+ struct bucket *match = search_hop_list(select_bucket(map, key), key, NULL);
return ((match != NULL) ? match->value : NULL);
}
@@ -443,7 +440,6 @@ find_empty_bucket(struct int_map *map, struct bucket *bucket, unsigned int max_p
/**
* move_empty_bucket() - Move an empty bucket closer to the start of the bucket array.
- * @map: The map containing the bucket.
* @hole: The empty bucket to fill with an entry that precedes it in one of its enclosing
* neighborhoods.
*
@@ -454,8 +450,7 @@ find_empty_bucket(struct int_map *map, struct bucket *bucket, unsigned int max_p
* Return: The bucket that was vacated by moving its entry to the provided hole, or NULL if no
* entry could be moved.
*/
-static struct bucket *move_empty_bucket(struct int_map *map __always_unused,
- struct bucket *hole)
+static struct bucket *move_empty_bucket(struct bucket *hole)
{
/*
* Examine every neighborhood that the empty bucket is part of, starting with the one in
@@ -516,7 +511,6 @@ static struct bucket *move_empty_bucket(struct int_map *map __always_unused,
/**
* update_mapping() - Find and update any existing mapping for a given key, returning the value
* associated with the key in the provided pointer.
- * @map: The int_map to attempt to modify.
* @neighborhood: The first bucket in the neighborhood that would contain the search key
* @key: The key with which to associate the new value.
* @new_value: The value to be associated with the key.
@@ -525,10 +519,10 @@ static struct bucket *move_empty_bucket(struct int_map *map __always_unused,
*
* Return: true if the map contains a mapping for the key, false if it does not.
*/
-static bool update_mapping(struct int_map *map, struct bucket *neighborhood,
- u64 key, void *new_value, bool update, void **old_value_ptr)
+static bool update_mapping(struct bucket *neighborhood, u64 key, void *new_value,
+ bool update, void **old_value_ptr)
{
- struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL);
+ struct bucket *bucket = search_hop_list(neighborhood, key, NULL);
if (bucket == NULL) {
/* There is no bucket containing the key in the neighborhood. */
@@ -584,7 +578,7 @@ static struct bucket *find_or_make_vacancy(struct int_map *map,
* The nearest empty bucket isn't within the neighborhood that must contain the new
* entry, so try to swap it with bucket that is closer.
*/
- hole = move_empty_bucket(map, hole);
+ hole = move_empty_bucket(hole);
}
return NULL;
@@ -625,7 +619,7 @@ int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update,
* Check whether the neighborhood already contains an entry for the key, in which case we
* optionally update it, returning the old value.
*/
- if (update_mapping(map, neighborhood, key, new_value, update, old_value_ptr))
+ if (update_mapping(neighborhood, key, new_value, update, old_value_ptr))
return VDO_SUCCESS;
/*
@@ -679,7 +673,7 @@ void *vdo_int_map_remove(struct int_map *map, u64 key)
/* Select the bucket to search and search it for an existing entry. */
struct bucket *bucket = select_bucket(map, key);
struct bucket *previous;
- struct bucket *victim = search_hop_list(map, bucket, key, &previous);
+ struct bucket *victim = search_hop_list(bucket, key, &previous);
if (victim == NULL) {
/* There is no matching entry to remove. */
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
index 9a3716bb3c05..421e5436c32c 100644
--- a/drivers/md/dm-vdo/io-submitter.c
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -346,7 +346,6 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
VDO_ASSERT_LOG_ONLY(!code->quiescent, "I/O not allowed in state %s", code->name);
- VDO_ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, "metadata bio has no next bio");
vdo_reset_completion(completion);
completion->error_handler = error_handler;
@@ -368,7 +367,7 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
* completions.
* @max_requests_active: Number of bios for merge tracking.
* @vdo: The vdo which will use this submitter.
- * @io_submitter: pointer to the new data structure.
+ * @io_submitter_ptr: pointer to the new data structure.
*
* Return: VDO_SUCCESS or an error.
*/
diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c
index 2802cf92922b..75dfcd7c5f63 100644
--- a/drivers/md/dm-vdo/message-stats.c
+++ b/drivers/md/dm-vdo/message-stats.c
@@ -4,6 +4,7 @@
*/
#include "dedupe.h"
+#include "indexer.h"
#include "logger.h"
#include "memory-alloc.h"
#include "message-stats.h"
@@ -430,3 +431,50 @@ int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen)
vdo_free(stats);
return VDO_SUCCESS;
}
+
+static void write_index_memory(u32 mem, char **buf, unsigned int *maxlen)
+{
+ char *prefix = "memorySize : ";
+
+ /* Convert index memory to fractional value */
+ if (mem == (u32)UDS_MEMORY_CONFIG_256MB)
+ write_string(prefix, "0.25, ", NULL, buf, maxlen);
+ else if (mem == (u32)UDS_MEMORY_CONFIG_512MB)
+ write_string(prefix, "0.50, ", NULL, buf, maxlen);
+ else if (mem == (u32)UDS_MEMORY_CONFIG_768MB)
+ write_string(prefix, "0.75, ", NULL, buf, maxlen);
+ else
+ write_u32(prefix, mem, ", ", buf, maxlen);
+}
+
+static void write_index_config(struct index_config *config, char **buf,
+ unsigned int *maxlen)
+{
+ write_string("index : ", "{ ", NULL, buf, maxlen);
+ /* index mem size */
+ write_index_memory(config->mem, buf, maxlen);
+ /* whether the index is sparse or not */
+ write_bool("isSparse : ", config->sparse, ", ", buf, maxlen);
+ write_string(NULL, "}", ", ", buf, maxlen);
+}
+
+int vdo_write_config(struct vdo *vdo, char **buf, unsigned int *maxlen)
+{
+ struct vdo_config *config = &vdo->states.vdo.config;
+
+ write_string(NULL, "{ ", NULL, buf, maxlen);
+ /* version */
+ write_u32("version : ", 1, ", ", buf, maxlen);
+ /* physical size */
+ write_block_count_t("physicalSize : ", config->physical_blocks * VDO_BLOCK_SIZE, ", ",
+ buf, maxlen);
+ /* logical size */
+ write_block_count_t("logicalSize : ", config->logical_blocks * VDO_BLOCK_SIZE, ", ",
+ buf, maxlen);
+ /* slab size */
+ write_block_count_t("slabSize : ", config->slab_size, ", ", buf, maxlen);
+ /* index config */
+ write_index_config(&vdo->geometry.index_config, buf, maxlen);
+ write_string(NULL, "}", NULL, buf, maxlen);
+ return VDO_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/message-stats.h b/drivers/md/dm-vdo/message-stats.h
index f7fceca9acab..f9c95eff569d 100644
--- a/drivers/md/dm-vdo/message-stats.h
+++ b/drivers/md/dm-vdo/message-stats.h
@@ -8,6 +8,7 @@
#include "types.h"
+int vdo_write_config(struct vdo *vdo, char **buf, unsigned int *maxlen);
int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen);
#endif /* VDO_MESSAGE_STATS_H */
diff --git a/drivers/md/dm-vdo/murmurhash3.c b/drivers/md/dm-vdo/murmurhash3.c
index 3a989efae142..b0b0587d85f3 100644
--- a/drivers/md/dm-vdo/murmurhash3.c
+++ b/drivers/md/dm-vdo/murmurhash3.c
@@ -8,7 +8,7 @@
#include "murmurhash3.h"
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
static inline u64 rotl64(u64 x, s8 r)
{
@@ -44,14 +44,11 @@ void murmurhash3_128(const void *key, const int len, const u32 seed, void *out)
u64 *hash_out = out;
/* body */
-
- const u64 *blocks = (const u64 *)(data);
-
int i;
for (i = 0; i < nblocks; i++) {
- u64 k1 = get_unaligned_le64(&blocks[i * 2]);
- u64 k2 = get_unaligned_le64(&blocks[i * 2 + 1]);
+ u64 k1 = get_unaligned_le64(&data[i * 16]);
+ u64 k2 = get_unaligned_le64(&data[i * 16 + 8]);
k1 *= c1;
k1 = ROTL64(k1, 31);
diff --git a/drivers/md/dm-vdo/numeric.h b/drivers/md/dm-vdo/numeric.h
index dc8c400b21d2..f568dc59e6f1 100644
--- a/drivers/md/dm-vdo/numeric.h
+++ b/drivers/md/dm-vdo/numeric.h
@@ -6,7 +6,7 @@
#ifndef UDS_NUMERIC_H
#define UDS_NUMERIC_H
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/types.h>
diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c
index 16cf29b4c90a..f70f5edabc10 100644
--- a/drivers/md/dm-vdo/packer.c
+++ b/drivers/md/dm-vdo/packer.c
@@ -250,7 +250,6 @@ static void abort_packing(struct data_vio *data_vio)
/**
* release_compressed_write_waiter() - Update a data_vio for which a successful compressed write
* has completed and send it on its way.
-
* @data_vio: The data_vio to release.
* @allocation: The allocation to which the compressed block was written.
*/
@@ -383,7 +382,7 @@ static void initialize_compressed_block(struct compressed_block *block, u16 size
* @compression: The agent's compression_state to pack in to.
* @data_vio: The data_vio to pack.
* @offset: The offset into the compressed block at which to pack the fragment.
- * @compressed_block: The compressed block which will be written out when batch is fully packed.
+ * @block: The compressed block which will be written out when batch is fully packed.
*
* Return: The new amount of space used.
*/
diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c
index 2fee3a7c1191..a43b5c45fab7 100644
--- a/drivers/md/dm-vdo/physical-zone.c
+++ b/drivers/md/dm-vdo/physical-zone.c
@@ -517,7 +517,7 @@ static int allocate_and_lock_block(struct allocation *allocation)
* @waiter: The allocating_vio that was waiting to allocate.
* @context: The context (unused).
*/
-static void retry_allocation(struct vdo_waiter *waiter, void *context __always_unused)
+static void retry_allocation(struct vdo_waiter *waiter, void __always_unused *context)
{
struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c
index ee6321a3e523..de58184f538f 100644
--- a/drivers/md/dm-vdo/recovery-journal.c
+++ b/drivers/md/dm-vdo/recovery-journal.c
@@ -1365,7 +1365,7 @@ static void add_queued_recovery_entries(struct recovery_journal_block *block)
*
* Implements waiter_callback_fn.
*/
-static void write_block(struct vdo_waiter *waiter, void *context __always_unused)
+static void write_block(struct vdo_waiter *waiter, void __always_unused *context)
{
struct recovery_journal_block *block =
container_of(waiter, struct recovery_journal_block, write_waiter);
diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c
index defc9359f10e..8c006fb3afcf 100644
--- a/drivers/md/dm-vdo/repair.c
+++ b/drivers/md/dm-vdo/repair.c
@@ -51,6 +51,8 @@ struct recovery_point {
bool increment_applied;
};
+DEFINE_MIN_HEAP(struct numbered_block_mapping, replay_heap);
+
struct repair_completion {
/* The completion header */
struct vdo_completion completion;
@@ -97,7 +99,7 @@ struct repair_completion {
* order, then original journal order. This permits efficient iteration over the journal
* entries in order.
*/
- struct min_heap replay_heap;
+ struct replay_heap replay_heap;
/* Fields tracking progress through the journal entries. */
struct numbered_block_mapping *current_entry;
struct numbered_block_mapping *current_unfetched_entry;
@@ -135,7 +137,7 @@ struct repair_completion {
* to sort by slot while still ensuring we replay all entries with the same slot in the exact order
* as they appeared in the journal.
*/
-static bool mapping_is_less_than(const void *item1, const void *item2)
+static bool mapping_is_less_than(const void *item1, const void *item2, void __always_unused *args)
{
const struct numbered_block_mapping *mapping1 =
(const struct numbered_block_mapping *) item1;
@@ -154,7 +156,7 @@ static bool mapping_is_less_than(const void *item1, const void *item2)
return 0;
}
-static void swap_mappings(void *item1, void *item2)
+static void swap_mappings(void *item1, void *item2, void __always_unused *args)
{
struct numbered_block_mapping *mapping1 = item1;
struct numbered_block_mapping *mapping2 = item2;
@@ -163,14 +165,13 @@ static void swap_mappings(void *item1, void *item2)
}
static const struct min_heap_callbacks repair_min_heap = {
- .elem_size = sizeof(struct numbered_block_mapping),
.less = mapping_is_less_than,
- .swp = swap_mappings,
+ .swp = NULL,
};
static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair)
{
- struct min_heap *heap = &repair->replay_heap;
+ struct replay_heap *heap = &repair->replay_heap;
struct numbered_block_mapping *last;
if (heap->nr == 0)
@@ -181,8 +182,8 @@ static struct numbered_block_mapping *sort_next_heap_element(struct repair_compl
* restore the heap invariant, and return a pointer to the popped element.
*/
last = &repair->entries[--heap->nr];
- swap_mappings(heap->data, last);
- min_heapify(heap, 0, &repair_min_heap);
+ swap_mappings(heap->data, last, NULL);
+ min_heap_sift_down(heap, 0, &repair_min_heap, NULL);
return last;
}
@@ -318,6 +319,7 @@ static bool __must_check abort_on_error(int result, struct repair_completion *re
/**
* drain_slab_depot() - Flush out all dirty refcounts blocks now that they have been rebuilt or
* recovered.
+ * @completion: The repair completion.
*/
static void drain_slab_depot(struct vdo_completion *completion)
{
@@ -653,9 +655,6 @@ static void rebuild_reference_counts(struct vdo_completion *completion)
vdo_traverse_forest(vdo->block_map, process_entry, completion);
}
-/**
- * increment_recovery_point() - Move the given recovery point forward by one entry.
- */
static void increment_recovery_point(struct recovery_point *point)
{
if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
@@ -952,6 +951,7 @@ static void abort_block_map_recovery(struct repair_completion *repair, int resul
/**
* find_entry_starting_next_page() - Find the first journal entry after a given entry which is not
* on the same block map page.
+ * @repair: The repair completion.
* @current_entry: The entry to search from.
* @needs_sort: Whether sorting is needed to proceed.
*
@@ -1117,12 +1117,12 @@ static void recover_block_map(struct vdo_completion *completion)
* Organize the journal entries into a binary heap so we can iterate over them in sorted
* order incrementally, avoiding an expensive sort call.
*/
- repair->replay_heap = (struct min_heap) {
+ repair->replay_heap = (struct replay_heap) {
.data = repair->entries,
.nr = repair->block_map_entry_count,
.size = repair->block_map_entry_count,
};
- min_heapify_all(&repair->replay_heap, &repair_min_heap);
+ min_heapify_all(&repair->replay_heap, &repair_min_heap, NULL);
vdo_log_info("Replaying %zu recovery entries into block map",
repair->block_map_entry_count);
@@ -1202,22 +1202,20 @@ static bool __must_check is_valid_recovery_journal_block(const struct recovery_j
* @journal: The journal to use.
* @header: The unpacked block header to check.
* @sequence: The expected sequence number.
- * @type: The expected metadata type.
*
* Return: True if the block matches.
*/
static bool __must_check is_exact_recovery_journal_block(const struct recovery_journal *journal,
const struct recovery_block_header *header,
- sequence_number_t sequence,
- enum vdo_metadata_type type)
+ sequence_number_t sequence)
{
- return ((header->metadata_type == type) &&
- (header->sequence_number == sequence) &&
+ return ((header->sequence_number == sequence) &&
(is_valid_recovery_journal_block(journal, header, true)));
}
/**
* find_recovery_journal_head_and_tail() - Find the tail and head of the journal.
+ * @repair: The repair completion.
*
* Return: True if there were valid journal blocks.
*/
@@ -1370,7 +1368,8 @@ static void extract_entries_from_block(struct repair_completion *repair,
get_recovery_journal_block_header(journal, repair->journal_data,
sequence);
- if (!is_exact_recovery_journal_block(journal, &header, sequence, format)) {
+ if (!is_exact_recovery_journal_block(journal, &header, sequence) ||
+ (header.metadata_type != format)) {
/* This block is invalid, so skip it. */
return;
}
@@ -1446,6 +1445,7 @@ static int validate_heads(struct repair_completion *repair)
/**
* extract_new_mappings() - Find all valid new mappings to be applied to the block map.
+ * @repair: The repair completion.
*
* The mappings are extracted from the journal and stored in a sortable array so that all of the
* mappings to be applied to a given block map page can be done in a single page fetch.
@@ -1500,6 +1500,7 @@ static int extract_new_mappings(struct repair_completion *repair)
/**
* compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of
* the journal.
+ * @repair: The repair completion.
*/
static noinline int compute_usages(struct repair_completion *repair)
{
@@ -1554,10 +1555,13 @@ static int parse_journal_for_recovery(struct repair_completion *repair)
sequence_number_t i, head;
bool found_entries = false;
struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
+ struct recovery_block_header header;
+ enum vdo_metadata_type expected_format;
head = min(repair->block_map_head, repair->slab_journal_head);
+ header = get_recovery_journal_block_header(journal, repair->journal_data, head);
+ expected_format = header.metadata_type;
for (i = head; i <= repair->highest_tail; i++) {
- struct recovery_block_header header;
journal_entry_count_t block_entries;
u8 j;
@@ -1569,19 +1573,15 @@ static int parse_journal_for_recovery(struct repair_completion *repair)
};
header = get_recovery_journal_block_header(journal, repair->journal_data, i);
- if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) {
- /* This is an old format block, so we need to upgrade */
- vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
- "Recovery journal is in the old format, a read-only rebuild is required.");
- vdo_enter_read_only_mode(repair->completion.vdo,
- VDO_UNSUPPORTED_VERSION);
- return VDO_UNSUPPORTED_VERSION;
- }
-
- if (!is_exact_recovery_journal_block(journal, &header, i,
- VDO_METADATA_RECOVERY_JOURNAL_2)) {
+ if (!is_exact_recovery_journal_block(journal, &header, i)) {
/* A bad block header was found so this must be the end of the journal. */
break;
+ } else if (header.metadata_type != expected_format) {
+ /* There is a mix of old and new format blocks, so we need to rebuild. */
+ vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
+ "Recovery journal is in an invalid format, a read-only rebuild is required.");
+ vdo_enter_read_only_mode(repair->completion.vdo, VDO_CORRUPT_JOURNAL);
+ return VDO_CORRUPT_JOURNAL;
}
block_entries = header.entry_count;
@@ -1617,8 +1617,14 @@ static int parse_journal_for_recovery(struct repair_completion *repair)
break;
}
- if (!found_entries)
+ if (!found_entries) {
return validate_heads(repair);
+ } else if (expected_format == VDO_METADATA_RECOVERY_JOURNAL) {
+ /* All journal blocks have the old format, so we need to upgrade. */
+ vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+ "Recovery journal is in the old format. Downgrade and complete recovery, then upgrade with a clean volume");
+ return VDO_UNSUPPORTED_VERSION;
+ }
/* Set the tail to the last valid tail block, if there is one. */
if (repair->tail_recovery_point.sector_count == 0)
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
index 46e4721e5b4f..8f0a35c63af6 100644
--- a/drivers/md/dm-vdo/slab-depot.c
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -1287,7 +1287,7 @@ static struct reference_block * __must_check get_reference_block(struct vdo_slab
* slab_block_number_from_pbn() - Determine the index within the slab of a particular physical
* block number.
* @slab: The slab.
- * @physical_block_number: The physical block number.
+ * @pbn: The physical block number.
* @slab_block_number_ptr: A pointer to the slab block number.
*
* Return: VDO_SUCCESS or an error code.
@@ -1459,7 +1459,6 @@ static int increment_for_data(struct vdo_slab *slab, struct reference_block *blo
* @block_number: The block to update.
* @old_status: The reference status of the data block before this decrement.
* @updater: The reference updater doing this operation in case we need to look up the pbn lock.
- * @lock: The pbn_lock associated with the block being decremented (may be NULL).
* @counter_ptr: A pointer to the count for the data block (in, out).
* @adjust_block_count: Whether to update the allocator's free block count.
*
@@ -3232,8 +3231,7 @@ int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
/**
* vdo_modify_reference_count() - Modify the reference count of a block by first making a slab
* journal entry and then updating the reference counter.
- *
- * @data_vio: The data_vio for which to add the entry.
+ * @completion: The data_vio completion for which to add the entry.
* @updater: Which of the data_vio's reference updaters is being submitted.
*/
void vdo_modify_reference_count(struct vdo_completion *completion,
@@ -3288,7 +3286,8 @@ int vdo_release_block_reference(struct block_allocator *allocator,
* Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
* before larger ones.
*/
-static bool slab_status_is_less_than(const void *item1, const void *item2)
+static bool slab_status_is_less_than(const void *item1, const void *item2,
+ void __always_unused *args)
{
const struct slab_status *info1 = item1;
const struct slab_status *info2 = item2;
@@ -3300,18 +3299,9 @@ static bool slab_status_is_less_than(const void *item1, const void *item2)
return info1->slab_number < info2->slab_number;
}
-static void swap_slab_statuses(void *item1, void *item2)
-{
- struct slab_status *info1 = item1;
- struct slab_status *info2 = item2;
-
- swap(*info1, *info2);
-}
-
static const struct min_heap_callbacks slab_status_min_heap = {
- .elem_size = sizeof(struct slab_status),
.less = slab_status_is_less_than,
- .swp = swap_slab_statuses,
+ .swp = NULL,
};
/* Inform the slab actor that a action has finished on some slab; used by apply_to_slabs(). */
@@ -3509,7 +3499,7 @@ static int get_slab_statuses(struct block_allocator *allocator,
static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator *allocator)
{
struct slab_status current_slab_status;
- struct min_heap heap;
+ DEFINE_MIN_HEAP(struct slab_status, heap) heap;
int result;
struct slab_status *slab_statuses;
struct slab_depot *depot = allocator->depot;
@@ -3521,12 +3511,12 @@ static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator
return result;
/* Sort the slabs by cleanliness, then by emptiness hint. */
- heap = (struct min_heap) {
+ heap = (struct heap) {
.data = slab_statuses,
.nr = allocator->slab_count,
.size = allocator->slab_count,
};
- min_heapify_all(&heap, &slab_status_min_heap);
+ min_heapify_all(&heap, &slab_status_min_heap, NULL);
while (heap.nr > 0) {
bool high_priority;
@@ -3534,7 +3524,7 @@ static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator
struct slab_journal *journal;
current_slab_status = slab_statuses[0];
- min_heap_pop(&heap, &slab_status_min_heap);
+ min_heap_pop(&heap, &slab_status_min_heap, NULL);
slab = depot->slabs[current_slab_status.slab_number];
if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) ||
@@ -4750,8 +4740,7 @@ void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent)
/**
* stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is
* currently working on.
- * @scrubber: The scrubber to stop.
- * @parent: The completion to notify when scrubbing has stopped.
+ * @allocator: The block allocator owning the scrubber to stop.
*/
static void stop_scrubbing(struct block_allocator *allocator)
{
diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c
index d3493450b169..dd252d660b6d 100644
--- a/drivers/md/dm-vdo/status-codes.c
+++ b/drivers/md/dm-vdo/status-codes.c
@@ -28,7 +28,7 @@ const struct error_info vdo_status_list[] = {
{ "VDO_LOCK_ERROR", "A lock is held incorrectly" },
{ "VDO_READ_ONLY", "The device is in read-only mode" },
{ "VDO_SHUTTING_DOWN", "The device is shutting down" },
- { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" },
+ { "VDO_CORRUPT_JOURNAL", "Recovery journal corrupted" },
{ "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" },
{ "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" },
{ "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" },
diff --git a/drivers/md/dm-vdo/status-codes.h b/drivers/md/dm-vdo/status-codes.h
index 72da04159f88..426dc8e2ca5d 100644
--- a/drivers/md/dm-vdo/status-codes.h
+++ b/drivers/md/dm-vdo/status-codes.h
@@ -52,7 +52,7 @@ enum vdo_status_codes {
VDO_READ_ONLY,
/* the VDO is shutting down */
VDO_SHUTTING_DOWN,
- /* the recovery journal has corrupt entries */
+ /* the recovery journal has corrupt entries or corrupt metadata */
VDO_CORRUPT_JOURNAL,
/* exceeds maximum number of slabs supported */
VDO_TOO_MANY_SLABS,
diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c
index fff847767755..a7e32baab4af 100644
--- a/drivers/md/dm-vdo/vdo.c
+++ b/drivers/md/dm-vdo/vdo.c
@@ -643,7 +643,7 @@ static void finish_vdo(struct vdo *vdo)
/**
* free_listeners() - Free the list of read-only listeners associated with a thread.
- * @thread_data: The thread holding the list to free.
+ * @thread: The thread holding the list to free.
*/
static void free_listeners(struct vdo_thread *thread)
{
@@ -852,7 +852,7 @@ int vdo_synchronous_flush(struct vdo *vdo)
/**
* vdo_get_state() - Get the current state of the vdo.
* @vdo: The vdo.
-
+ *
* Context: This method may be called from any thread.
*
* Return: The current state of the vdo.
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
index b291578f726f..e710f3c5a972 100644
--- a/drivers/md/dm-vdo/vio.c
+++ b/drivers/md/dm-vdo/vio.c
@@ -202,6 +202,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
if (data == NULL)
return VDO_SUCCESS;
+ bio->bi_ioprio = 0;
bio->bi_io_vec = bio->bi_inline_vecs;
bio->bi_max_vecs = vio->block_count + 1;
len = VDO_BLOCK_SIZE * vio->block_count;
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index e46aee6f932e..0c41949db784 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -40,35 +40,23 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset)
}
/*
- * Decode an RS block using Reed-Solomon.
- */
-static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio,
- u8 *data, u8 *fec, int neras)
-{
- int i;
- uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
-
- for (i = 0; i < v->fec->roots; i++)
- par[i] = fec[i];
-
- return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras,
- fio->erasures, 0, NULL);
-}
-
-/*
* Read error-correcting codes for the requested RS block. Returns a pointer
* to the data block. Caller is responsible for releasing buf.
*/
static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
- unsigned int *offset, struct dm_buffer **buf,
- unsigned short ioprio)
+ unsigned int *offset, unsigned int par_buf_offset,
+ struct dm_buffer **buf, unsigned short ioprio)
{
u64 position, block, rem;
u8 *res;
+ /* We have already part of parity bytes read, skip to the next block */
+ if (par_buf_offset)
+ index++;
+
position = (index + rsb) * v->fec->roots;
block = div64_u64_rem(position, v->fec->io_size, &rem);
- *offset = (unsigned int)rem;
+ *offset = par_buf_offset ? 0 : (unsigned int)rem;
res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio);
if (IS_ERR(res)) {
@@ -128,11 +116,13 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
{
int r, corrected = 0, res;
struct dm_buffer *buf;
- unsigned int n, i, offset;
+ unsigned int n, i, j, offset, par_buf_offset = 0;
+ uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
u8 *par, *block;
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
- par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+ par = fec_read_parity(v, rsb, block_offset, &offset,
+ par_buf_offset, &buf, bio->bi_ioprio);
if (IS_ERR(par))
return PTR_ERR(par);
@@ -142,7 +132,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
*/
fec_for_each_buffer_rs_block(fio, n, i) {
block = fec_buffer_rs_block(v, fio, n, i);
- res = fec_decode_rs8(v, fio, block, &par[offset], neras);
+ for (j = 0; j < v->fec->roots - par_buf_offset; j++)
+ par_buf[par_buf_offset + j] = par[offset + j];
+ /* Decode an RS block using Reed-Solomon */
+ res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn,
+ NULL, neras, fio->erasures, 0, NULL);
if (res < 0) {
r = res;
goto error;
@@ -155,12 +149,22 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
if (block_offset >= 1 << v->data_dev_block_bits)
goto done;
- /* read the next block when we run out of parity bytes */
- offset += v->fec->roots;
+ /* Read the next block when we run out of parity bytes */
+ offset += (v->fec->roots - par_buf_offset);
+ /* Check if parity bytes are split between blocks */
+ if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) {
+ par_buf_offset = v->fec->io_size - offset;
+ for (j = 0; j < par_buf_offset; j++)
+ par_buf[j] = par[offset + j];
+ offset += par_buf_offset;
+ } else
+ par_buf_offset = 0;
+
if (offset >= v->fec->io_size) {
dm_bufio_release(buf);
- par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+ par = fec_read_parity(v, rsb, block_offset, &offset,
+ par_buf_offset, &buf, bio->bi_ioprio);
if (IS_ERR(par))
return PTR_ERR(par);
}
@@ -186,8 +190,7 @@ error:
static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
u8 *want_digest, u8 *data)
{
- if (unlikely(verity_hash(v, verity_io_hash_req(v, io),
- data, 1 << v->data_dev_block_bits,
+ if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits,
verity_io_real_digest(v, io), true)))
return 0;
@@ -251,7 +254,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
bufio = v->bufio;
}
- bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio_prio(bio));
+ bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio);
if (IS_ERR(bbuf)) {
DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld",
v->data_dev->name,
@@ -388,8 +391,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
}
/* Always re-validate the corrected block against the expected hash */
- r = verity_hash(v, verity_io_hash_req(v, io), fio->output,
- 1 << v->data_dev_block_bits,
+ r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits,
verity_io_real_digest(v, io), true);
if (unlikely(r < 0))
return r;
@@ -404,24 +406,9 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
return 0;
}
-static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data,
- size_t len)
-{
- struct dm_verity_fec_io *fio = fec_io(io);
-
- memcpy(data, &fio->output[fio->output_pos], len);
- fio->output_pos += len;
-
- return 0;
-}
-
-/*
- * Correct errors in a block. Copies corrected block to dest if non-NULL,
- * otherwise to a bio_vec starting from iter.
- */
+/* Correct errors in a block. Copies corrected block to dest. */
int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
- enum verity_block_type type, sector_t block, u8 *dest,
- struct bvec_iter *iter)
+ enum verity_block_type type, sector_t block, u8 *dest)
{
int r;
struct dm_verity_fec_io *fio = fec_io(io);
@@ -471,12 +458,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
goto done;
}
- if (dest)
- memcpy(dest, fio->output, 1 << v->data_dev_block_bits);
- else if (iter) {
- fio->output_pos = 0;
- r = verity_for_bv_block(v, io, iter, fec_bv_copy);
- }
+ memcpy(dest, fio->output, 1 << v->data_dev_block_bits);
done:
fio->level--;
@@ -746,10 +728,7 @@ int verity_fec_ctr(struct dm_verity *v)
return -E2BIG;
}
- if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1))
- f->io_size = 1 << v->data_dev_block_bits;
- else
- f->io_size = v->fec->roots << SECTOR_SHIFT;
+ f->io_size = 1 << v->data_dev_block_bits;
f->bufio = dm_bufio_client_create(f->dev->bdev,
f->io_size,
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 8454070d2824..09123a612953 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -57,7 +57,6 @@ struct dm_verity_fec_io {
u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */
unsigned int nbufs; /* number of buffers allocated */
u8 *output; /* buffer for corrected output */
- size_t output_pos;
unsigned int level; /* recursion level */
};
@@ -70,7 +69,7 @@ extern bool verity_fec_is_enabled(struct dm_verity *v);
extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
enum verity_block_type type, sector_t block,
- u8 *dest, struct bvec_iter *iter);
+ u8 *dest);
extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz,
char *result, unsigned int maxlen);
@@ -100,8 +99,7 @@ static inline bool verity_fec_is_enabled(struct dm_verity *v)
static inline int verity_fec_decode(struct dm_verity *v,
struct dm_verity_io *io,
enum verity_block_type type,
- sector_t block, u8 *dest,
- struct bvec_iter *iter)
+ sector_t block, u8 *dest)
{
return -EOPNOTSUPP;
}
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index bb5da66da4c1..e86c1431b108 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -22,6 +22,7 @@
#include <linux/scatterlist.h>
#include <linux/string.h>
#include <linux/jump_label.h>
+#include <linux/security.h>
#define DM_MSG_PREFIX "verity"
@@ -35,11 +36,13 @@
#define DM_VERITY_OPT_LOGGING "ignore_corruption"
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
#define DM_VERITY_OPT_PANIC "panic_on_corruption"
+#define DM_VERITY_OPT_ERROR_RESTART "restart_on_error"
+#define DM_VERITY_OPT_ERROR_PANIC "panic_on_error"
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
#define DM_VERITY_OPT_TASKLET_VERIFY "try_verify_in_tasklet"
-#define DM_VERITY_OPTS_MAX (4 + DM_VERITY_OPTS_FEC + \
+#define DM_VERITY_OPTS_MAX (5 + DM_VERITY_OPTS_FEC + \
DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
@@ -48,6 +51,9 @@ module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644);
static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled);
+/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */
+static DEFINE_STATIC_KEY_FALSE(ahash_enabled);
+
struct dm_verity_prefetch_work {
struct work_struct work;
struct dm_verity *v;
@@ -87,7 +93,7 @@ static void dm_bufio_alloc_callback(struct dm_buffer *buf)
*/
static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
{
- return v->data_start + dm_target_offset(v->ti, bi_sector);
+ return dm_target_offset(v->ti, bi_sector);
}
/*
@@ -102,7 +108,7 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
return block >> (level * v->hash_per_block_bits);
}
-static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
+static int verity_ahash_update(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len,
struct crypto_wait *wait)
{
@@ -135,12 +141,12 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
/*
* Wrapper for crypto_ahash_init, which handles verity salting.
*/
-static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
+static int verity_ahash_init(struct dm_verity *v, struct ahash_request *req,
struct crypto_wait *wait, bool may_sleep)
{
int r;
- ahash_request_set_tfm(req, v->tfm);
+ ahash_request_set_tfm(req, v->ahash_tfm);
ahash_request_set_callback(req,
may_sleep ? CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0,
crypto_req_done, (void *)wait);
@@ -155,18 +161,18 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
}
if (likely(v->salt_size && (v->version >= 1)))
- r = verity_hash_update(v, req, v->salt, v->salt_size, wait);
+ r = verity_ahash_update(v, req, v->salt, v->salt_size, wait);
return r;
}
-static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
- u8 *digest, struct crypto_wait *wait)
+static int verity_ahash_final(struct dm_verity *v, struct ahash_request *req,
+ u8 *digest, struct crypto_wait *wait)
{
int r;
if (unlikely(v->salt_size && (!v->version))) {
- r = verity_hash_update(v, req, v->salt, v->salt_size, wait);
+ r = verity_ahash_update(v, req, v->salt, v->salt_size, wait);
if (r < 0) {
DMERR("%s failed updating salt: %d", __func__, r);
@@ -180,23 +186,27 @@ out:
return r;
}
-int verity_hash(struct dm_verity *v, struct ahash_request *req,
+int verity_hash(struct dm_verity *v, struct dm_verity_io *io,
const u8 *data, size_t len, u8 *digest, bool may_sleep)
{
int r;
- struct crypto_wait wait;
-
- r = verity_hash_init(v, req, &wait, may_sleep);
- if (unlikely(r < 0))
- goto out;
- r = verity_hash_update(v, req, data, len, &wait);
- if (unlikely(r < 0))
- goto out;
+ if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) {
+ struct ahash_request *req = verity_io_hash_req(v, io);
+ struct crypto_wait wait;
- r = verity_hash_final(v, req, digest, &wait);
+ r = verity_ahash_init(v, req, &wait, may_sleep) ?:
+ verity_ahash_update(v, req, data, len, &wait) ?:
+ verity_ahash_final(v, req, digest, &wait);
+ } else {
+ struct shash_desc *desc = verity_io_hash_req(v, io);
-out:
+ desc->tfm = v->shash_tfm;
+ r = crypto_shash_import(desc, v->initial_hashstate) ?:
+ crypto_shash_finup(desc, data, len, digest);
+ }
+ if (unlikely(r))
+ DMERR("Error hashing block: %d", r);
return r;
}
@@ -311,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
}
} else {
data = dm_bufio_read_with_ioprio(v->bufio, hash_block,
- &buf, bio_prio(bio));
+ &buf, bio->bi_ioprio);
}
if (IS_ERR(data))
@@ -325,8 +335,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
goto release_ret_r;
}
- r = verity_hash(v, verity_io_hash_req(v, io),
- data, 1 << v->hash_dev_block_bits,
+ r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits,
verity_io_real_digest(v, io), !io->in_bh);
if (unlikely(r < 0))
goto release_ret_r;
@@ -342,14 +351,14 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
r = -EAGAIN;
goto release_ret_r;
} else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA,
- hash_block, data, NULL) == 0)
+ hash_block, data) == 0)
aux->hash_verified = 1;
else if (verity_handle_err(v,
DM_VERITY_BLOCK_TYPE_METADATA,
hash_block)) {
- struct bio *bio =
- dm_bio_from_per_bio_data(io,
- v->ti->per_io_data_size);
+ struct bio *bio;
+ io->had_mismatch = true;
+ bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
dm_audit_log_bio(DM_MSG_PREFIX, "verify-metadata", bio,
block, 0);
r = -EIO;
@@ -404,98 +413,8 @@ out:
return r;
}
-/*
- * Calculates the digest for the given bio
- */
-static int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
- struct bvec_iter *iter, struct crypto_wait *wait)
-{
- unsigned int todo = 1 << v->data_dev_block_bits;
- struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
- struct scatterlist sg;
- struct ahash_request *req = verity_io_hash_req(v, io);
-
- do {
- int r;
- unsigned int len;
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- sg_init_table(&sg, 1);
-
- len = bv.bv_len;
-
- if (likely(len >= todo))
- len = todo;
- /*
- * Operating on a single page at a time looks suboptimal
- * until you consider the typical block size is 4,096B.
- * Going through this loops twice should be very rare.
- */
- sg_set_page(&sg, bv.bv_page, len, bv.bv_offset);
- ahash_request_set_crypt(req, &sg, NULL, len);
- r = crypto_wait_req(crypto_ahash_update(req), wait);
-
- if (unlikely(r < 0)) {
- DMERR("%s crypto op failed: %d", __func__, r);
- return r;
- }
-
- bio_advance_iter(bio, iter, len);
- todo -= len;
- } while (todo);
-
- return 0;
-}
-
-/*
- * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec
- * starting from iter.
- */
-int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
- struct bvec_iter *iter,
- int (*process)(struct dm_verity *v,
- struct dm_verity_io *io, u8 *data,
- size_t len))
-{
- unsigned int todo = 1 << v->data_dev_block_bits;
- struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
-
- do {
- int r;
- u8 *page;
- unsigned int len;
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- page = bvec_kmap_local(&bv);
- len = bv.bv_len;
-
- if (likely(len >= todo))
- len = todo;
-
- r = process(v, io, page, len);
- kunmap_local(page);
-
- if (r < 0)
- return r;
-
- bio_advance_iter(bio, iter, len);
- todo -= len;
- } while (todo);
-
- return 0;
-}
-
-static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io,
- u8 *data, size_t len)
-{
- memcpy(data, io->recheck_buffer, len);
- io->recheck_buffer += len;
-
- return 0;
-}
-
static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
- struct bvec_iter start, sector_t cur_block)
+ sector_t cur_block, u8 *dest)
{
struct page *page;
void *buffer;
@@ -518,8 +437,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
if (unlikely(r))
goto free_ret;
- r = verity_hash(v, verity_io_hash_req(v, io), buffer,
- 1 << v->data_dev_block_bits,
+ r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits,
verity_io_real_digest(v, io), true);
if (unlikely(r))
goto free_ret;
@@ -530,11 +448,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
goto free_ret;
}
- io->recheck_buffer = buffer;
- r = verity_for_bv_block(v, io, &start, verity_recheck_copy);
- if (unlikely(r))
- goto free_ret;
-
+ memcpy(dest, buffer, 1 << v->data_dev_block_bits);
r = 0;
free_ret:
mempool_free(page, &v->recheck_pool);
@@ -542,23 +456,37 @@ free_ret:
return r;
}
-static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
- u8 *data, size_t len)
-{
- memset(data, 0, len);
- return 0;
-}
-
-/*
- * Moves the bio iter one data block forward.
- */
-static inline void verity_bv_skip_block(struct dm_verity *v,
- struct dm_verity_io *io,
- struct bvec_iter *iter)
+static int verity_handle_data_hash_mismatch(struct dm_verity *v,
+ struct dm_verity_io *io,
+ struct bio *bio, sector_t blkno,
+ u8 *data)
{
- struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
+ if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
+ /*
+ * Error handling code (FEC included) cannot be run in the
+ * BH workqueue, so fallback to a standard workqueue.
+ */
+ return -EAGAIN;
+ }
+ if (verity_recheck(v, io, blkno, data) == 0) {
+ if (v->validated_blocks)
+ set_bit(blkno, v->validated_blocks);
+ return 0;
+ }
+#if defined(CONFIG_DM_VERITY_FEC)
+ if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno,
+ data) == 0)
+ return 0;
+#endif
+ if (bio->bi_status)
+ return -EIO; /* Error correction failed; Just return error */
- bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits);
+ if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, blkno)) {
+ io->had_mismatch = true;
+ dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", bio, blkno, 0);
+ return -EIO;
+ }
+ return 0;
}
/*
@@ -566,12 +494,10 @@ static inline void verity_bv_skip_block(struct dm_verity *v,
*/
static int verity_verify_io(struct dm_verity_io *io)
{
- bool is_zero;
struct dm_verity *v = io->v;
- struct bvec_iter start;
+ const unsigned int block_size = 1 << v->data_dev_block_bits;
struct bvec_iter iter_copy;
struct bvec_iter *iter;
- struct crypto_wait wait;
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
unsigned int b;
@@ -585,16 +511,17 @@ static int verity_verify_io(struct dm_verity_io *io)
} else
iter = &io->iter;
- for (b = 0; b < io->n_blocks; b++) {
+ for (b = 0; b < io->n_blocks;
+ b++, bio_advance_iter(bio, iter, block_size)) {
int r;
sector_t cur_block = io->block + b;
- struct ahash_request *req = verity_io_hash_req(v, io);
+ bool is_zero;
+ struct bio_vec bv;
+ void *data;
if (v->validated_blocks && bio->bi_status == BLK_STS_OK &&
- likely(test_bit(cur_block, v->validated_blocks))) {
- verity_bv_skip_block(v, io, iter);
+ likely(test_bit(cur_block, v->validated_blocks)))
continue;
- }
r = verity_hash_for_block(v, io, cur_block,
verity_io_want_digest(v, io),
@@ -602,67 +529,49 @@ static int verity_verify_io(struct dm_verity_io *io)
if (unlikely(r < 0))
return r;
+ bv = bio_iter_iovec(bio, *iter);
+ if (unlikely(bv.bv_len < block_size)) {
+ /*
+ * Data block spans pages. This should not happen,
+ * since dm-verity sets dma_alignment to the data block
+ * size minus 1, and dm-verity also doesn't allow the
+ * data block size to be greater than PAGE_SIZE.
+ */
+ DMERR_LIMIT("unaligned io (data block spans pages)");
+ return -EIO;
+ }
+
+ data = bvec_kmap_local(&bv);
+
if (is_zero) {
/*
* If we expect a zero block, don't validate, just
* return zeros.
*/
- r = verity_for_bv_block(v, io, iter,
- verity_bv_zero);
- if (unlikely(r < 0))
- return r;
-
+ memset(data, 0, block_size);
+ kunmap_local(data);
continue;
}
- r = verity_hash_init(v, req, &wait, !io->in_bh);
- if (unlikely(r < 0))
- return r;
-
- start = *iter;
- r = verity_for_io_block(v, io, iter, &wait);
- if (unlikely(r < 0))
- return r;
-
- r = verity_hash_final(v, req, verity_io_real_digest(v, io),
- &wait);
- if (unlikely(r < 0))
+ r = verity_hash(v, io, data, block_size,
+ verity_io_real_digest(v, io), !io->in_bh);
+ if (unlikely(r < 0)) {
+ kunmap_local(data);
return r;
+ }
if (likely(memcmp(verity_io_real_digest(v, io),
verity_io_want_digest(v, io), v->digest_size) == 0)) {
if (v->validated_blocks)
set_bit(cur_block, v->validated_blocks);
+ kunmap_local(data);
continue;
- } else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
- /*
- * Error handling code (FEC included) cannot be run in a
- * tasklet since it may sleep, so fallback to work-queue.
- */
- return -EAGAIN;
- } else if (verity_recheck(v, io, start, cur_block) == 0) {
- if (v->validated_blocks)
- set_bit(cur_block, v->validated_blocks);
- continue;
-#if defined(CONFIG_DM_VERITY_FEC)
- } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
- cur_block, NULL, &start) == 0) {
- continue;
-#endif
- } else {
- if (bio->bi_status) {
- /*
- * Error correction failed; Just return error
- */
- return -EIO;
- }
- if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
- cur_block)) {
- dm_audit_log_bio(DM_MSG_PREFIX, "verify-data",
- bio, cur_block, 0);
- return -EIO;
- }
}
+ r = verity_handle_data_hash_mismatch(v, io, bio, cur_block,
+ data);
+ kunmap_local(data);
+ if (unlikely(r))
+ return r;
}
return 0;
@@ -677,6 +586,11 @@ static inline bool verity_is_system_shutting_down(void)
|| system_state == SYSTEM_RESTART;
}
+static void restart_io_error(struct work_struct *w)
+{
+ kernel_restart("dm-verity device has I/O error");
+}
+
/*
* End one "io" structure with a given error.
*/
@@ -691,6 +605,24 @@ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status)
if (!static_branch_unlikely(&use_bh_wq_enabled) || !io->in_bh)
verity_fec_finish_io(io);
+ if (unlikely(status != BLK_STS_OK) &&
+ unlikely(!(bio->bi_opf & REQ_RAHEAD)) &&
+ !io->had_mismatch &&
+ !verity_is_system_shutting_down()) {
+ if (v->error_mode == DM_VERITY_MODE_PANIC) {
+ panic("dm-verity device has I/O error");
+ }
+ if (v->error_mode == DM_VERITY_MODE_RESTART) {
+ static DECLARE_WORK(restart_work, restart_io_error);
+ queue_work(v->verify_wq, &restart_work);
+ /*
+ * We deliberately don't call bio_endio here, because
+ * the machine will be restarted anyway.
+ */
+ return;
+ }
+ }
+
bio_endio(bio);
}
@@ -849,6 +781,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
io->orig_bi_end_io = bio->bi_end_io;
io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits;
+ io->had_mismatch = false;
bio->bi_end_io = verity_end_io;
bio->bi_private = io;
@@ -856,7 +789,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
verity_fec_init_io(io);
- verity_submit_prefetch(v, io, bio_prio(bio));
+ verity_submit_prefetch(v, io, bio->bi_ioprio);
submit_bio_noacct(bio);
@@ -899,6 +832,8 @@ static void verity_status(struct dm_target *ti, status_type_t type,
DMEMIT("%02x", v->salt[x]);
if (v->mode != DM_VERITY_MODE_EIO)
args++;
+ if (v->error_mode != DM_VERITY_MODE_EIO)
+ args++;
if (verity_fec_is_enabled(v))
args += DM_VERITY_OPTS_FEC;
if (v->zero_digest)
@@ -928,6 +863,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
BUG();
}
}
+ if (v->error_mode != DM_VERITY_MODE_EIO) {
+ DMEMIT(" ");
+ switch (v->error_mode) {
+ case DM_VERITY_MODE_RESTART:
+ DMEMIT(DM_VERITY_OPT_ERROR_RESTART);
+ break;
+ case DM_VERITY_MODE_PANIC:
+ DMEMIT(DM_VERITY_OPT_ERROR_PANIC);
+ break;
+ default:
+ BUG();
+ }
+ }
if (v->zero_digest)
DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES);
if (v->validated_blocks)
@@ -980,6 +928,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
DMEMIT("invalid");
}
}
+ if (v->error_mode != DM_VERITY_MODE_EIO) {
+ DMEMIT(",verity_error_mode=");
+ switch (v->error_mode) {
+ case DM_VERITY_MODE_RESTART:
+ DMEMIT(DM_VERITY_OPT_ERROR_RESTART);
+ break;
+ case DM_VERITY_MODE_PANIC:
+ DMEMIT(DM_VERITY_OPT_ERROR_PANIC);
+ break;
+ default:
+ DMEMIT("invalid");
+ }
+ }
DMEMIT(";");
break;
}
@@ -991,7 +952,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
*bdev = v->data_dev->bdev;
- if (v->data_start || ti->len != bdev_nr_sectors(v->data_dev->bdev))
+ if (ti->len != bdev_nr_sectors(v->data_dev->bdev))
return 1;
return 0;
}
@@ -1001,7 +962,7 @@ static int verity_iterate_devices(struct dm_target *ti,
{
struct dm_verity *v = ti->private;
- return fn(ti, v->data_dev, v->data_start, ti->len, data);
+ return fn(ti, v->data_dev, 0, ti->len, data);
}
static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -1014,9 +975,52 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
if (limits->physical_block_size < 1 << v->data_dev_block_bits)
limits->physical_block_size = 1 << v->data_dev_block_bits;
- blk_limits_io_min(limits, limits->logical_block_size);
+ limits->io_min = limits->logical_block_size;
+
+ /*
+ * Similar to what dm-crypt does, opt dm-verity out of support for
+ * direct I/O that is aligned to less than the traditional direct I/O
+ * alignment requirement of logical_block_size. This prevents dm-verity
+ * data blocks from crossing pages, eliminating various edge cases.
+ */
+ limits->dma_alignment = limits->logical_block_size - 1;
}
+#ifdef CONFIG_SECURITY
+
+static int verity_init_sig(struct dm_verity *v, const void *sig,
+ size_t sig_size)
+{
+ v->sig_size = sig_size;
+
+ if (sig) {
+ v->root_digest_sig = kmemdup(sig, v->sig_size, GFP_KERNEL);
+ if (!v->root_digest_sig)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void verity_free_sig(struct dm_verity *v)
+{
+ kfree(v->root_digest_sig);
+}
+
+#else
+
+static inline int verity_init_sig(struct dm_verity *v, const void *sig,
+ size_t sig_size)
+{
+ return 0;
+}
+
+static inline void verity_free_sig(struct dm_verity *v)
+{
+}
+
+#endif /* CONFIG_SECURITY */
+
static void verity_dtr(struct dm_target *ti)
{
struct dm_verity *v = ti->private;
@@ -1033,11 +1037,17 @@ static void verity_dtr(struct dm_target *ti)
kvfree(v->validated_blocks);
kfree(v->salt);
+ kfree(v->initial_hashstate);
kfree(v->root_digest);
kfree(v->zero_digest);
+ verity_free_sig(v);
- if (v->tfm)
- crypto_free_ahash(v->tfm);
+ if (v->ahash_tfm) {
+ static_branch_dec(&ahash_enabled);
+ crypto_free_ahash(v->ahash_tfm);
+ } else {
+ crypto_free_shash(v->shash_tfm);
+ }
kfree(v->alg_name);
@@ -1083,7 +1093,7 @@ static int verity_alloc_most_once(struct dm_verity *v)
static int verity_alloc_zero_digest(struct dm_verity *v)
{
int r = -ENOMEM;
- struct ahash_request *req;
+ struct dm_verity_io *io;
u8 *zero_data;
v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
@@ -1091,9 +1101,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
if (!v->zero_digest)
return r;
- req = kmalloc(v->ahash_reqsize, GFP_KERNEL);
+ io = kmalloc(sizeof(*io) + v->hash_reqsize, GFP_KERNEL);
- if (!req)
+ if (!io)
return r; /* verity_dtr will free zero_digest */
zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL);
@@ -1101,11 +1111,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
if (!zero_data)
goto out;
- r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits,
+ r = verity_hash(v, io, zero_data, 1 << v->data_dev_block_bits,
v->zero_digest, true);
out:
- kfree(req);
+ kfree(io);
kfree(zero_data);
return r;
@@ -1133,6 +1143,25 @@ static int verity_parse_verity_mode(struct dm_verity *v, const char *arg_name)
return 0;
}
+static inline bool verity_is_verity_error_mode(const char *arg_name)
+{ /* True when arg_name matches, ignoring case, the restart or the panic error option. */
+ return (!strcasecmp(arg_name, DM_VERITY_OPT_ERROR_RESTART) ||
+ !strcasecmp(arg_name, DM_VERITY_OPT_ERROR_PANIC));
+}
+
+static int verity_parse_verity_error_mode(struct dm_verity *v, const char *arg_name)
+{
+ if (v->error_mode) /* error_mode is already non-zero: reject a second setting */
+ return -EINVAL;
+
+ if (!strcasecmp(arg_name, DM_VERITY_OPT_ERROR_RESTART))
+ v->error_mode = DM_VERITY_MODE_RESTART;
+ else if (!strcasecmp(arg_name, DM_VERITY_OPT_ERROR_PANIC))
+ v->error_mode = DM_VERITY_MODE_PANIC;
+
+ return 0; /* note: an arg_name matching neither option leaves error_mode untouched */
+}
+
static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
struct dm_verity_sig_opts *verify_args,
bool only_modifier_opts)
@@ -1167,6 +1196,16 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
}
continue;
+ } else if (verity_is_verity_error_mode(arg_name)) {
+ if (only_modifier_opts)
+ continue;
+ r = verity_parse_verity_error_mode(v, arg_name);
+ if (r) {
+ ti->error = "Conflicting error handling parameters";
+ return r;
+ }
+ continue;
+
} else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) {
if (only_modifier_opts)
continue;
@@ -1226,6 +1265,113 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
return r;
}
+static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name)
+{ /* Choose the shash or ahash tfm for alg_name; fill in v's digest and request sizes. */
+ struct dm_target *ti = v->ti;
+ struct crypto_ahash *ahash;
+ struct crypto_shash *shash = NULL;
+ const char *driver_name;
+
+ v->alg_name = kstrdup(alg_name, GFP_KERNEL);
+ if (!v->alg_name) {
+ ti->error = "Cannot allocate algorithm name";
+ return -ENOMEM;
+ }
+
+ /*
+ * Allocate the hash transformation object that this dm-verity instance
+ * will use. The vast majority of dm-verity users use CPU-based
+ * hashing, so when possible use the shash API to minimize the crypto
+ * API overhead. If the ahash API resolves to a different driver
+ * (likely an off-CPU hardware offload), use ahash instead. Also use
+ * ahash if the obsolete dm-verity format with the appended salt is
+ * being used, so that quirk only needs to be handled in one place.
+ */
+ ahash = crypto_alloc_ahash(alg_name, 0,
+ v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0);
+ if (IS_ERR(ahash)) {
+ ti->error = "Cannot initialize hash function";
+ return PTR_ERR(ahash);
+ }
+ driver_name = crypto_ahash_driver_name(ahash);
+ if (v->version >= 1 /* salt prepended, not appended? */) {
+ shash = crypto_alloc_shash(alg_name, 0, 0);
+ if (!IS_ERR(shash) &&
+ strcmp(crypto_shash_driver_name(shash), driver_name) != 0) {
+ /*
+ * ahash gave a different driver than shash, so probably
+ * this is a case of real hardware offload. Use ahash.
+ */
+ crypto_free_shash(shash);
+ shash = NULL;
+ }
+ }
+ if (!IS_ERR_OR_NULL(shash)) { /* shash was allocated and its driver name matches ahash's */
+ crypto_free_ahash(ahash);
+ ahash = NULL;
+ v->shash_tfm = shash;
+ v->digest_size = crypto_shash_digestsize(shash);
+ v->hash_reqsize = sizeof(struct shash_desc) +
+ crypto_shash_descsize(shash);
+ DMINFO("%s using shash \"%s\"", alg_name, driver_name); /* NOTE(review): driver_name came from the ahash freed above; presumably still valid because shash uses the same driver - confirm */
+ } else { /* no usable shash: allocation error, version < 1, or a different driver */
+ v->ahash_tfm = ahash;
+ static_branch_inc(&ahash_enabled); /* paired with static_branch_dec() in verity_dtr() */
+ v->digest_size = crypto_ahash_digestsize(ahash);
+ v->hash_reqsize = sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(ahash);
+ DMINFO("%s using ahash \"%s\"", alg_name, driver_name);
+ }
+ if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { /* a hash block must hold at least two digests */
+ ti->error = "Digest size too big";
+ return -EINVAL; /* the tfm stays recorded in v; it is not freed here */
+ }
+ return 0;
+}
+
+static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg)
+{ /* Parse the hex salt in arg ("-" skips it); with shash, also precompute the salted state. */
+ struct dm_target *ti = v->ti;
+
+ if (strcmp(arg, "-") != 0) {
+ v->salt_size = strlen(arg) / 2; /* two hex characters per salt byte */
+ v->salt = kmalloc(v->salt_size, GFP_KERNEL);
+ if (!v->salt) {
+ ti->error = "Cannot allocate salt";
+ return -ENOMEM;
+ }
+ if (strlen(arg) != v->salt_size * 2 ||
+ hex2bin(v->salt, arg, v->salt_size)) { /* odd length, or hex2bin() returned non-zero */
+ ti->error = "Invalid salt";
+ return -EINVAL; /* v->salt is not freed here */
+ }
+ }
+ if (v->shash_tfm) {
+ SHASH_DESC_ON_STACK(desc, v->shash_tfm);
+ int r;
+
+ /*
+ * Compute the pre-salted hash state that can be passed to
+ * crypto_shash_import() for each block later.
+ */
+ v->initial_hashstate = kmalloc(
+ crypto_shash_statesize(v->shash_tfm), GFP_KERNEL);
+ if (!v->initial_hashstate) {
+ ti->error = "Cannot allocate initial hash state";
+ return -ENOMEM;
+ }
+ desc->tfm = v->shash_tfm;
+ r = crypto_shash_init(desc) ?:
+ crypto_shash_update(desc, v->salt, v->salt_size) ?:
+ crypto_shash_export(desc, v->initial_hashstate); /* r is the first non-zero result, else 0 */
+ if (r) {
+ ti->error = "Cannot set up initial hash state";
+ return r;
+ }
+ }
+ return 0;
+}
+
/*
* Target parameters:
* <version> The current format is version 1.
@@ -1350,38 +1496,9 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
v->hash_start = num_ll;
- v->alg_name = kstrdup(argv[7], GFP_KERNEL);
- if (!v->alg_name) {
- ti->error = "Cannot allocate algorithm name";
- r = -ENOMEM;
- goto bad;
- }
-
- v->tfm = crypto_alloc_ahash(v->alg_name, 0,
- v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0);
- if (IS_ERR(v->tfm)) {
- ti->error = "Cannot initialize hash function";
- r = PTR_ERR(v->tfm);
- v->tfm = NULL;
- goto bad;
- }
-
- /*
- * dm-verity performance can vary greatly depending on which hash
- * algorithm implementation is used. Help people debug performance
- * problems by logging the ->cra_driver_name.
- */
- DMINFO("%s using implementation \"%s\"", v->alg_name,
- crypto_hash_alg_common(v->tfm)->base.cra_driver_name);
-
- v->digest_size = crypto_ahash_digestsize(v->tfm);
- if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
- ti->error = "Digest size too big";
- r = -EINVAL;
+ r = verity_setup_hash_alg(v, argv[7]);
+ if (r)
goto bad;
- }
- v->ahash_reqsize = sizeof(struct ahash_request) +
- crypto_ahash_reqsize(v->tfm);
v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
if (!v->root_digest) {
@@ -1397,21 +1514,9 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
root_hash_digest_to_validate = argv[8];
- if (strcmp(argv[9], "-")) {
- v->salt_size = strlen(argv[9]) / 2;
- v->salt = kmalloc(v->salt_size, GFP_KERNEL);
- if (!v->salt) {
- ti->error = "Cannot allocate salt";
- r = -ENOMEM;
- goto bad;
- }
- if (strlen(argv[9]) != v->salt_size * 2 ||
- hex2bin(v->salt, argv[9], v->salt_size)) {
- ti->error = "Invalid salt";
- r = -EINVAL;
- goto bad;
- }
- }
+ r = verity_setup_salt_and_hashstate(v, argv[9]);
+ if (r)
+ goto bad;
argv += 10;
argc -= 10;
@@ -1434,6 +1539,13 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Root hash verification failed";
goto bad;
}
+
+ r = verity_init_sig(v, verify_args.sig, verify_args.sig_size);
+ if (r < 0) {
+ ti->error = "Cannot allocate root digest signature";
+ goto bad;
+ }
+
v->hash_per_block_bits =
__fls((1 << v->hash_dev_block_bits) / v->digest_size);
@@ -1513,8 +1625,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- ti->per_io_data_size = sizeof(struct dm_verity_io) +
- v->ahash_reqsize + v->digest_size * 2;
+ ti->per_io_data_size = sizeof(struct dm_verity_io) + v->hash_reqsize;
r = verity_fec_ctr(v);
if (r)
@@ -1539,14 +1650,6 @@ bad:
}
/*
- * Check whether a DM target is a verity target.
- */
-bool dm_is_verity_target(struct dm_target *ti)
-{
- return ti->type->module == THIS_MODULE;
-}
-
-/*
* Get the verity mode (error behavior) of a verity target.
*
* Returns the verity mode of the target, or -EINVAL if 'ti' is not a verity
@@ -1584,8 +1687,79 @@ int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, unsigned i
return 0;
}
+#ifdef CONFIG_SECURITY
+
+#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG
+
+static int verity_security_set_signature(struct block_device *bdev,
+ struct dm_verity *v)
+{
+ /*
+ * If the dm-verity target is unsigned, v->root_digest_sig will
+ * be NULL, and the hook call is still required to let LSMs mark
+ * the device as unsigned. This information is crucial for LSMs to
+ * block operations such as execution on unsigned files.
+ */
+ return security_bdev_setintegrity(bdev,
+ LSM_INT_DMVERITY_SIG_VALID,
+ v->root_digest_sig,
+ v->sig_size);
+}
+
+#else
+
+static inline int verity_security_set_signature(struct block_device *bdev,
+ struct dm_verity *v)
+{
+ return 0; /* stub when CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG is off: no hook call */
+}
+
+#endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */
+
+/*
+ * Expose verity target's root hash and signature data to LSMs before resume.
+ *
+ * Returns 0 on success, or the error from the security hooks (e.g. -ENOMEM).
+ */
+static int verity_preresume(struct dm_target *ti)
+{
+ struct block_device *bdev;
+ struct dm_verity_digest root_digest;
+ struct dm_verity *v;
+ int r;
+
+ v = ti->private;
+ bdev = dm_disk(dm_table_get_md(ti->table))->part0;
+ root_digest.digest = v->root_digest;
+ root_digest.digest_len = v->digest_size;
+ if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) /* no shash_tfm: this target uses ahash_tfm */
+ root_digest.alg = crypto_ahash_alg_name(v->ahash_tfm);
+ else
+ root_digest.alg = crypto_shash_alg_name(v->shash_tfm);
+
+ r = security_bdev_setintegrity(bdev, LSM_INT_DMVERITY_ROOTHASH, &root_digest,
+ sizeof(root_digest));
+ if (r)
+ return r;
+
+ r = verity_security_set_signature(bdev, v);
+ if (r)
+ goto bad;
+
+ return 0;
+
+bad:
+
+ security_bdev_setintegrity(bdev, LSM_INT_DMVERITY_ROOTHASH, NULL, 0); /* presumably clears the root hash set above - confirm; its result is ignored */
+
+ return r;
+}
+
+#endif /* CONFIG_SECURITY */
+
static struct target_type verity_target = {
.name = "verity",
+/* Note: the LSMs depend on the singleton and immutable features */
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
.version = {1, 10, 0},
.module = THIS_MODULE,
@@ -1596,9 +1770,20 @@ static struct target_type verity_target = {
.prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,
.io_hints = verity_io_hints,
+#ifdef CONFIG_SECURITY
+ .preresume = verity_preresume,
+#endif /* CONFIG_SECURITY */
};
module_dm(verity);
+/*
+ * Check whether a DM target is a verity target.
+ */
+bool dm_is_verity_target(struct dm_target *ti)
+{
+ return ti->type == &verity_target; /* pointer identity with this file's target_type */
+}
+
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
index 4836508ea50c..a9e2c6c0a33c 100644
--- a/drivers/md/dm-verity-verify-sig.c
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -126,6 +126,13 @@ int verity_verify_root_hash(const void *root_hash, size_t root_hash_len,
NULL,
#endif
VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL);
+#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING
+ if (ret == -ENOKEY || ret == -EKEYREJECTED)
+ ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data,
+ sig_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL);
+#endif
return ret;
}
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 20b1bcf03474..8cbb57862ae1 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -39,12 +39,17 @@ struct dm_verity {
struct dm_target *ti;
struct dm_bufio_client *bufio;
char *alg_name;
- struct crypto_ahash *tfm;
+ struct crypto_ahash *ahash_tfm; /* either this or shash_tfm is set */
+ struct crypto_shash *shash_tfm; /* either this or ahash_tfm is set */
u8 *root_digest; /* digest of the root block */
u8 *salt; /* salt: its size is salt_size */
+ u8 *initial_hashstate; /* salted initial state, if shash_tfm is set */
u8 *zero_digest; /* digest for a zero block */
+#ifdef CONFIG_SECURITY
+ u8 *root_digest_sig; /* signature of the root digest */
+ unsigned int sig_size; /* root digest signature size */
+#endif /* CONFIG_SECURITY */
unsigned int salt_size;
- sector_t data_start; /* data offset in 512-byte sectors */
sector_t hash_start; /* hash start in blocks */
sector_t data_blocks; /* the number of data blocks */
sector_t hash_blocks; /* the number of hash blocks */
@@ -56,8 +61,9 @@ struct dm_verity {
bool hash_failed:1; /* set if hash of any block failed */
bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */
unsigned int digest_size; /* digest size for the current hash algorithm */
- unsigned int ahash_reqsize;/* the size of temporary space for crypto */
+ unsigned int hash_reqsize; /* the size of temporary space for crypto */
enum verity_mode mode; /* mode for handling verification errors */
+ enum verity_mode error_mode;/* mode for handling I/O errors */
unsigned int corrupted_errs;/* Number of errors for corrupted blocks */
struct workqueue_struct *verify_wq;
@@ -85,49 +91,41 @@ struct dm_verity_io {
sector_t block;
unsigned int n_blocks;
bool in_bh;
+ bool had_mismatch;
struct work_struct work;
struct work_struct bh_work;
- char *recheck_buffer;
+ u8 real_digest[HASH_MAX_DIGESTSIZE];
+ u8 want_digest[HASH_MAX_DIGESTSIZE];
/*
- * Three variably-size fields follow this struct:
- *
- * u8 hash_req[v->ahash_reqsize];
- * u8 real_digest[v->digest_size];
- * u8 want_digest[v->digest_size];
- *
- * To access them use: verity_io_hash_req(), verity_io_real_digest()
- * and verity_io_want_digest().
+ * This struct is followed by a variable-sized hash request of size
+ * v->hash_reqsize, either a struct ahash_request or a struct shash_desc
+ * (depending on whether ahash_tfm or shash_tfm is being used). To
+ * access it, use verity_io_hash_req().
*/
};
-static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v,
- struct dm_verity_io *io)
+static inline void *verity_io_hash_req(struct dm_verity *v,
+ struct dm_verity_io *io)
{
- return (struct ahash_request *)(io + 1);
+ return io + 1;
}
static inline u8 *verity_io_real_digest(struct dm_verity *v,
struct dm_verity_io *io)
{
- return (u8 *)(io + 1) + v->ahash_reqsize;
+ return io->real_digest;
}
static inline u8 *verity_io_want_digest(struct dm_verity *v,
struct dm_verity_io *io)
{
- return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size;
+ return io->want_digest;
}
-extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
- struct bvec_iter *iter,
- int (*process)(struct dm_verity *v,
- struct dm_verity_io *io,
- u8 *data, size_t len));
-
-extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
+extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io,
const u8 *data, size_t len, u8 *digest, bool may_sleep);
extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 3b13e6eb1aa4..9a0bb623e823 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -61,7 +61,6 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
static void zero_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
- limits->max_discard_sectors = UINT_MAX;
limits->max_hw_discard_sectors = UINT_MAX;
limits->discard_granularity = 512;
}
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index eb9832b22b14..20edd3fabbab 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -13,8 +13,6 @@
#define DM_MSG_PREFIX "zone"
-#define DM_ZONE_INVALID_WP_OFST UINT_MAX
-
/*
* For internal zone reports bypassing the top BIO submission path.
*/
@@ -60,16 +58,23 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
struct dm_table *map;
int srcu_idx, ret;
- if (dm_suspended_md(md))
- return -EAGAIN;
+ if (!md->zone_revalidate_map) {
+ /* Regular user context */
+ if (dm_suspended_md(md))
+ return -EAGAIN;
- map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ map = dm_get_live_table(md, &srcu_idx);
+ if (!map)
+ return -EIO;
+ } else {
+ /* Zone revalidation during __bind() */
+ map = md->zone_revalidate_map;
+ }
ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
- dm_put_live_table(md, srcu_idx);
+ if (!md->zone_revalidate_map)
+ dm_put_live_table(md, srcu_idx);
return ret;
}
@@ -138,129 +143,49 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
}
}
-void dm_cleanup_zoned_dev(struct mapped_device *md)
-{
- if (md->disk) {
- bitmap_free(md->disk->conv_zones_bitmap);
- md->disk->conv_zones_bitmap = NULL;
- bitmap_free(md->disk->seq_zones_wlock);
- md->disk->seq_zones_wlock = NULL;
- }
-
- kvfree(md->zwp_offset);
- md->zwp_offset = NULL;
- md->nr_zones = 0;
-}
-
-static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
-{
- switch (zone->cond) {
- case BLK_ZONE_COND_IMP_OPEN:
- case BLK_ZONE_COND_EXP_OPEN:
- case BLK_ZONE_COND_CLOSED:
- return zone->wp - zone->start;
- case BLK_ZONE_COND_FULL:
- return zone->len;
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_NOT_WP:
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- default:
- /*
- * Conventional, offline and read-only zones do not have a valid
- * write pointer. Use 0 as for an empty zone.
- */
- return 0;
- }
-}
-
-static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- struct mapped_device *md = data;
- struct gendisk *disk = md->disk;
-
- switch (zone->type) {
- case BLK_ZONE_TYPE_CONVENTIONAL:
- if (!disk->conv_zones_bitmap) {
- disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones,
- GFP_NOIO);
- if (!disk->conv_zones_bitmap)
- return -ENOMEM;
- }
- set_bit(idx, disk->conv_zones_bitmap);
- break;
- case BLK_ZONE_TYPE_SEQWRITE_REQ:
- case BLK_ZONE_TYPE_SEQWRITE_PREF:
- if (!disk->seq_zones_wlock) {
- disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones,
- GFP_NOIO);
- if (!disk->seq_zones_wlock)
- return -ENOMEM;
- }
- if (!md->zwp_offset) {
- md->zwp_offset =
- kvcalloc(disk->nr_zones, sizeof(unsigned int),
- GFP_KERNEL);
- if (!md->zwp_offset)
- return -ENOMEM;
- }
- md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
-
- break;
- default:
- DMERR("Invalid zone type 0x%x at sectors %llu",
- (int)zone->type, zone->start);
- return -ENODEV;
- }
-
- return 0;
-}
-
/*
* Revalidate the zones of a mapped device to initialize resources necessary
* for zone append emulation. This is called from __bind() context, while the
* new table is not live yet, so blk_revalidate_disk_zones() is pointed at it
* through md->zone_revalidate_map.
*/
-static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
+int dm_revalidate_zones(struct dm_table *t, struct request_queue *q)
{
+ struct mapped_device *md = t->md;
struct gendisk *disk = md->disk;
- unsigned int noio_flag;
int ret;
- /*
- * Check if something changed. If yes, cleanup the current resources
- * and reallocate everything.
- */
- if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
- dm_cleanup_zoned_dev(md);
+ if (!get_capacity(disk))
+ return 0;
+
+ /* Revalidate only if something changed. */
+ if (!disk->nr_zones || disk->nr_zones != md->nr_zones) {
+ DMINFO("%s using %s zone append",
+ disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
+ md->nr_zones = 0;
+ }
+
if (md->nr_zones)
return 0;
/*
- * Scan all zones to initialize everything. Ensure that all vmalloc
- * operations in this context are done as if GFP_NOIO was specified.
+ * Our table is not live yet. So the call to dm_get_live_table()
+ * in dm_blk_report_zones() will fail. Set a temporary pointer to
+ * our table for dm_blk_report_zones() to use directly.
*/
- noio_flag = memalloc_noio_save();
- ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
- dm_zone_revalidate_cb, md);
- memalloc_noio_restore(noio_flag);
- if (ret < 0)
- goto err;
- if (ret != disk->nr_zones) {
- ret = -EIO;
- goto err;
+ md->zone_revalidate_map = t;
+ ret = blk_revalidate_disk_zones(disk);
+ md->zone_revalidate_map = NULL;
+
+ if (ret) {
+ DMERR("Revalidate zones failed %d", ret);
+ return ret;
}
md->nr_zones = disk->nr_zones;
return 0;
-
-err:
- DMERR("Revalidate zones failed %d", ret);
- dm_cleanup_zoned_dev(md);
- return ret;
}
static int device_not_zone_append_capable(struct dm_target *ti,
@@ -286,297 +211,201 @@ static bool dm_table_supports_zone_append(struct dm_table *t)
return true;
}
-int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
-{
- struct mapped_device *md = t->md;
-
- /*
- * For a zoned target, the number of zones should be updated for the
- * correct value to be exposed in sysfs queue/nr_zones.
- */
- WARN_ON_ONCE(queue_is_mq(q));
- md->disk->nr_zones = bdev_nr_zones(md->disk->part0);
-
- /* Check if zone append is natively supported */
- if (dm_table_supports_zone_append(t)) {
- clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- dm_cleanup_zoned_dev(md);
- return 0;
- }
-
- /*
- * Mark the mapped device as needing zone append emulation and
- * initialize the emulation resources once the capacity is set.
- */
- set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- if (!get_capacity(md->disk))
- return 0;
-
- return dm_revalidate_zones(md, t);
-}
+struct dm_device_zone_count {
+ sector_t start;
+ sector_t len;
+ unsigned int total_nr_seq_zones;
+ unsigned int target_nr_seq_zones;
+};
-static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
+/*
+ * Count the total number of and the number of mapped sequential zones of a
+ * target zoned device.
+ */
+static int dm_device_count_zones_cb(struct blk_zone *zone,
+ unsigned int idx, void *data)
{
- unsigned int *wp_offset = data;
+ struct dm_device_zone_count *zc = data;
- *wp_offset = dm_get_zone_wp_offset(zone);
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+ zc->total_nr_seq_zones++;
+ if (zone->start >= zc->start &&
+ zone->start < zc->start + zc->len)
+ zc->target_nr_seq_zones++;
+ }
return 0;
}
-static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
- unsigned int *wp_ofst)
+static int dm_device_count_zones(struct dm_dev *dev,
+ struct dm_device_zone_count *zc)
{
- sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
- unsigned int noio_flag;
- struct dm_table *t;
- int srcu_idx, ret;
-
- t = dm_get_live_table(md, &srcu_idx);
- if (!t)
- return -EIO;
-
- /*
- * Ensure that all memory allocations in this context are done as if
- * GFP_NOIO was specified.
- */
- noio_flag = memalloc_noio_save();
- ret = dm_blk_do_report_zones(md, t, sector, 1,
- dm_update_zone_wp_offset_cb, wp_ofst);
- memalloc_noio_restore(noio_flag);
-
- dm_put_live_table(md, srcu_idx);
+ int ret;
- if (ret != 1)
+ ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES,
+ dm_device_count_zones_cb, zc);
+ if (ret < 0)
+ return ret;
+ if (!ret)
return -EIO;
-
return 0;
}
-struct orig_bio_details {
- enum req_op op;
- unsigned int nr_sectors;
+struct dm_zone_resource_limits {
+ unsigned int mapped_nr_seq_zones;
+ struct queue_limits *lim;
+ bool reliable_limits;
};
-/*
- * First phase of BIO mapping for targets with zone append emulation:
- * check all BIO that change a zone writer pointer and change zone
- * append operations into regular write operations.
- */
-static bool dm_zone_map_bio_begin(struct mapped_device *md,
- unsigned int zno, struct bio *clone)
+static int device_get_zone_resource_limits(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
{
- sector_t zsectors = bdev_zone_sectors(md->disk->part0);
- unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
+ struct dm_zone_resource_limits *zlim = data;
+ struct gendisk *disk = dev->bdev->bd_disk;
+ unsigned int max_open_zones, max_active_zones;
+ int ret;
+ struct dm_device_zone_count zc = {
+ .start = start,
+ .len = len,
+ };
/*
- * If the target zone is in an error state, recover by inspecting the
- * zone to get its current write pointer position. Note that since the
- * target zone is already locked, a BIO issuing context should never
- * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
+ * If the target is not the whole device, the device zone resources may
+ * be shared between different targets. Check this by counting the
+ * number of mapped sequential zones: if this number is smaller than the
+ * total number of sequential zones of the target device, then resource
+ * sharing may happen and the zone limits will not be reliable.
*/
- if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
- if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
- return false;
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
+ ret = dm_device_count_zones(dev, &zc);
+ if (ret) {
+ DMERR("Count %s zones failed %d", disk->disk_name, ret);
+ return ret;
}
- switch (bio_op(clone)) {
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_FINISH:
- return true;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- /* Writes must be aligned to the zone write pointer */
- if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
- return false;
- break;
- case REQ_OP_ZONE_APPEND:
- /*
- * Change zone append operations into a non-mergeable regular
- * writes directed at the current write pointer position of the
- * target zone.
- */
- clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
- (clone->bi_opf & (~REQ_OP_MASK));
- clone->bi_iter.bi_sector += zwp_offset;
- break;
- default:
- DMWARN_LIMIT("Invalid BIO operation");
- return false;
- }
-
- /* Cannot write to a full zone */
- if (zwp_offset >= zsectors)
- return false;
-
- return true;
-}
+ /*
+ * If the target does not map any sequential zones, then we do not need
+ * any zone resource limits.
+ */
+ if (!zc.target_nr_seq_zones)
+ return 0;
-/*
- * Second phase of BIO mapping for targets with zone append emulation:
- * update the zone write pointer offset array to account for the additional
- * data written to a zone. Note that at this point, the remapped clone BIO
- * may already have completed, so we do not touch it.
- */
-static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
- struct orig_bio_details *orig_bio_details,
- unsigned int nr_sectors)
-{
- unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
-
- /* The clone BIO may already have been completed and failed */
- if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
- return BLK_STS_IOERR;
-
- /* Update the zone wp offset */
- switch (orig_bio_details->op) {
- case REQ_OP_ZONE_RESET:
- WRITE_ONCE(md->zwp_offset[zno], 0);
- return BLK_STS_OK;
- case REQ_OP_ZONE_FINISH:
- WRITE_ONCE(md->zwp_offset[zno],
- bdev_zone_sectors(md->disk->part0));
- return BLK_STS_OK;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
- return BLK_STS_OK;
- case REQ_OP_ZONE_APPEND:
- /*
- * Check that the target did not truncate the write operation
- * emulating a zone append.
- */
- if (nr_sectors != orig_bio_details->nr_sectors) {
- DMWARN_LIMIT("Truncated write for zone append");
- return BLK_STS_IOERR;
- }
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
- return BLK_STS_OK;
- default:
- DMWARN_LIMIT("Invalid BIO operation");
- return BLK_STS_IOERR;
+ /*
+ * If the target does not map all sequential zones, the limits
+ * will not be reliable and we cannot use REQ_OP_ZONE_RESET_ALL.
+ */
+ if (zc.target_nr_seq_zones < zc.total_nr_seq_zones) {
+ zlim->reliable_limits = false;
+ ti->zone_reset_all_supported = false;
}
-}
-static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
- struct bio *clone)
-{
- if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
- return;
-
- wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
- bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
-}
-
-static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
- struct bio *clone)
-{
- if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
- return;
+ /*
+ * If the target maps fewer sequential zones than the limit values, then
+ * we do not have limits for this target.
+ */
+ max_active_zones = disk->queue->limits.max_active_zones;
+ if (max_active_zones >= zc.target_nr_seq_zones)
+ max_active_zones = 0;
+ zlim->lim->max_active_zones =
+ min_not_zero(max_active_zones, zlim->lim->max_active_zones);
+
+ max_open_zones = disk->queue->limits.max_open_zones;
+ if (max_open_zones >= zc.target_nr_seq_zones)
+ max_open_zones = 0;
+ zlim->lim->max_open_zones =
+ min_not_zero(max_open_zones, zlim->lim->max_open_zones);
- WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
- clear_bit_unlock(zno, disk->seq_zones_wlock);
- smp_mb__after_atomic();
- wake_up_bit(disk->seq_zones_wlock, zno);
+ /*
+ * Also count the total number of sequential zones for the mapped
+ * device so that when we are done inspecting all its targets, we are
+ * able to check if the mapped device actually has any sequential zones.
+ */
+ zlim->mapped_nr_seq_zones += zc.target_nr_seq_zones;
- bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
+ return 0;
}
-static bool dm_need_zone_wp_tracking(struct bio *bio)
+int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
+ struct queue_limits *lim)
{
+ struct mapped_device *md = t->md;
+ struct gendisk *disk = md->disk;
+ struct dm_zone_resource_limits zlim = {
+ .reliable_limits = true,
+ .lim = lim,
+ };
+
/*
- * Special processing is not needed for operations that do not need the
- * zone write lock, that is, all operations that target conventional
- * zones and all operations that do not modify directly a sequential
- * zone write pointer.
+ * Check if zone append is natively supported, and if not, set the
+ * mapped device queue as needing zone append emulation.
*/
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
- return false;
- switch (bio_op(bio)) {
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_FINISH:
- case REQ_OP_ZONE_APPEND:
- return bio_zone_is_seq(bio);
- default:
- return false;
+ WARN_ON_ONCE(queue_is_mq(q));
+ if (dm_table_supports_zone_append(t)) {
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ } else {
+ set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ lim->max_hw_zone_append_sectors = 0;
}
-}
-
-/*
- * Special IO mapping for targets needing zone append emulation.
- */
-int dm_zone_map_bio(struct dm_target_io *tio)
-{
- struct dm_io *io = tio->io;
- struct dm_target *ti = tio->ti;
- struct mapped_device *md = io->md;
- struct bio *clone = &tio->clone;
- struct orig_bio_details orig_bio_details;
- unsigned int zno;
- blk_status_t sts;
- int r;
/*
- * IOs that do not change a zone write pointer do not need
- * any additional special processing.
+ * Determine the max open and max active zone limits for the mapped
+ * device by inspecting the zone resource limits and the zones mapped
+ * by each target.
*/
- if (!dm_need_zone_wp_tracking(clone))
- return ti->type->map(ti, clone);
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
- /* Lock the target zone */
- zno = bio_zone_no(clone);
- dm_zone_lock(md->disk, zno, clone);
+ /*
+ * Assume that the target can accept REQ_OP_ZONE_RESET_ALL.
+ * device_get_zone_resource_limits() may adjust this if one of
+ * the devices used by the target does not have all its
+ * sequential write required zones mapped.
+ */
+ ti->zone_reset_all_supported = true;
- orig_bio_details.nr_sectors = bio_sectors(clone);
- orig_bio_details.op = bio_op(clone);
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti,
+ device_get_zone_resource_limits, &zlim)) {
+ DMERR("Could not determine %s zone resource limits",
+ disk->disk_name);
+ return -ENODEV;
+ }
+ }
/*
- * Check that the bio and the target zone write pointer offset are
- * both valid, and if the bio is a zone append, remap it to a write.
+ * If we only have conventional zones mapped, expose the mapped device
+ * as a regular device.
*/
- if (!dm_zone_map_bio_begin(md, zno, clone)) {
- dm_zone_unlock(md->disk, zno, clone);
- return DM_MAPIO_KILL;
- }
-
- /* Let the target do its work */
- r = ti->type->map(ti, clone);
- switch (r) {
- case DM_MAPIO_SUBMITTED:
- /*
- * The target submitted the clone BIO. The target zone will
- * be unlocked on completion of the clone.
- */
- sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
- *tio->len_ptr);
- break;
- case DM_MAPIO_REMAPPED:
- /*
- * The target only remapped the clone BIO. In case of error,
- * unlock the target zone here as the clone will not be
- * submitted.
- */
- sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
- *tio->len_ptr);
- if (sts != BLK_STS_OK)
- dm_zone_unlock(md->disk, zno, clone);
- break;
- case DM_MAPIO_REQUEUE:
- case DM_MAPIO_KILL:
- default:
- dm_zone_unlock(md->disk, zno, clone);
- sts = BLK_STS_IOERR;
- break;
+ if (!zlim.mapped_nr_seq_zones) {
+ lim->max_open_zones = 0;
+ lim->max_active_zones = 0;
+ lim->max_hw_zone_append_sectors = 0;
+ lim->zone_write_granularity = 0;
+ lim->chunk_sectors = 0;
+ lim->features &= ~BLK_FEAT_ZONED;
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ md->nr_zones = 0;
+ disk->nr_zones = 0;
+ return 0;
}
- if (sts != BLK_STS_OK)
- return DM_MAPIO_KILL;
+ /*
+ * Warn once (when the capacity is not yet set) if the mapped device is
+ * partially using zone resources of the target devices as that leads to
+ * unreliable limits, i.e. if another mapped device uses the same
+ * underlying devices, we cannot enforce zone limits to guarantee that
+ * writing will not lead to errors. Note that we really should return
+ * an error for such a case, but there is no easy way to find out if
+ * another mapped device uses the same underlying zoned devices.
+ */
+ if (!get_capacity(disk) && !zlim.reliable_limits)
+ DMWARN("%s zone resource limits may be unreliable",
+ disk->disk_name);
- return r;
+ if (lim->features & BLK_FEAT_ZONED &&
+ !static_key_enabled(&zoned_enabled.key))
+ static_branch_enable(&zoned_enabled);
+ return 0;
}
/*
@@ -587,61 +416,53 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
struct mapped_device *md = io->md;
struct gendisk *disk = md->disk;
struct bio *orig_bio = io->orig_bio;
- unsigned int zwp_offset;
- unsigned int zno;
/*
- * For targets that do not emulate zone append, we only need to
- * handle native zone-append bios.
+ * Get the offset within the zone of the written sector
+ * and add that to the original bio sector position.
*/
- if (!dm_emulate_zone_append(md)) {
- /*
- * Get the offset within the zone of the written sector
- * and add that to the original bio sector position.
- */
- if (clone->bi_status == BLK_STS_OK &&
- bio_op(clone) == REQ_OP_ZONE_APPEND) {
- sector_t mask =
- (sector_t)bdev_zone_sectors(disk->part0) - 1;
-
- orig_bio->bi_iter.bi_sector +=
- clone->bi_iter.bi_sector & mask;
- }
+ if (clone->bi_status == BLK_STS_OK &&
+ bio_op(clone) == REQ_OP_ZONE_APPEND) {
+ sector_t mask = bdev_zone_sectors(disk->part0) - 1;
- return;
+ orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask;
}
+ return;
+}
+
+static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
/*
- * For targets that do emulate zone append, if the clone BIO does not
- * own the target zone write lock, we have nothing to do.
+ * For an all-zones reset, ignore conventional, empty, read-only
+ * and offline zones.
*/
- if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
- return;
+ switch (zone->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_READONLY:
+ case BLK_ZONE_COND_OFFLINE:
+ return 0;
+ default:
+ set_bit(idx, (unsigned long *)data);
+ return 0;
+ }
+}
- zno = bio_zone_no(orig_bio);
+int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
+ sector_t sector, unsigned int nr_zones,
+ unsigned long *need_reset)
+{
+ int ret;
- if (clone->bi_status != BLK_STS_OK) {
- /*
- * BIOs that modify a zone write pointer may leave the zone
- * in an unknown state in case of failure (e.g. the write
- * pointer was only partially advanced). In this case, set
- * the target zone write pointer as invalid unless it is
- * already being updated.
- */
- WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
- } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
- /*
- * Get the written sector for zone append operation that were
- * emulated using regular write operations.
- */
- zwp_offset = READ_ONCE(md->zwp_offset[zno]);
- if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
- WRITE_ONCE(md->zwp_offset[zno],
- DM_ZONE_INVALID_WP_OFST);
- else
- orig_bio->bi_iter.bi_sector +=
- zwp_offset - bio_sectors(orig_bio);
+ ret = dm_blk_do_report_zones(md, t, sector, nr_zones,
+ dm_zone_need_reset_cb, need_reset);
+ if (ret != nr_zones) {
+ DMERR("Get %s zone reset bitmap failed\n",
+ md->disk->disk_name);
+ return -EIO;
}
- dm_zone_unlock(disk, zno, clone);
+ return 0;
}
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 8156881a31de..deff22ecccbb 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -245,11 +245,6 @@ unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
return zmd->zone_nr_blocks;
}
-unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
-{
- return zmd->zone_nr_blocks_shift;
-}
-
unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
{
return zmd->zone_nr_sectors;
@@ -3005,48 +3000,3 @@ void dmz_dtr_metadata(struct dmz_metadata *zmd)
dmz_cleanup_metadata(zmd);
kfree(zmd);
}
-
-/*
- * Check zone information on resume.
- */
-int dmz_resume_metadata(struct dmz_metadata *zmd)
-{
- struct dm_zone *zone;
- sector_t wp_block;
- unsigned int i;
- int ret;
-
- /* Check zones */
- for (i = 0; i < zmd->nr_zones; i++) {
- zone = dmz_get(zmd, i);
- if (!zone) {
- dmz_zmd_err(zmd, "Unable to get zone %u", i);
- return -EIO;
- }
- wp_block = zone->wp_block;
-
- ret = dmz_update_zone(zmd, zone);
- if (ret) {
- dmz_zmd_err(zmd, "Broken zone %u", i);
- return ret;
- }
-
- if (dmz_is_offline(zone)) {
- dmz_zmd_warn(zmd, "Zone %u is offline", i);
- continue;
- }
-
- /* Check write pointer */
- if (!dmz_is_seq(zone))
- zone->wp_block = 0;
- else if (zone->wp_block != wp_block) {
- dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
- i, (u64)zone->wp_block, (u64)wp_block);
- zone->wp_block = wp_block;
- dmz_invalidate_blocks(zmd, zone, zone->wp_block,
- zmd->zone_nr_blocks - zone->wp_block);
- }
- }
-
- return 0;
-}
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index d58db9a27e6c..76e2c6868548 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -76,9 +76,9 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
* pointer and the requested position.
*/
nr_blocks = block - wp_block;
- ret = blkdev_issue_zeroout(dev->bdev,
- dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
- dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
+ ret = blk_zone_issue_zeroout(dev->bdev,
+ dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
+ dmz_blk2sect(nr_blocks), GFP_NOIO);
if (ret) {
dmz_dev_err(dev,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 621794a9edd6..6141fc25d842 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -996,12 +996,11 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
limits->logical_block_size = DMZ_BLOCK_SIZE;
limits->physical_block_size = DMZ_BLOCK_SIZE;
- blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
- blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);
+ limits->io_min = DMZ_BLOCK_SIZE;
+ limits->io_opt = DMZ_BLOCK_SIZE;
limits->discard_alignment = 0;
limits->discard_granularity = DMZ_BLOCK_SIZE;
- limits->max_discard_sectors = chunk_sectors;
limits->max_hw_discard_sectors = chunk_sectors;
limits->max_write_zeroes_sectors = chunk_sectors;
@@ -1010,7 +1009,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
limits->max_sectors = chunk_sectors;
/* We are exposing a drive-managed zoned block device */
- limits->zoned = false;
+ limits->features &= ~BLK_FEAT_ZONED;
}
/*
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 265494d3f711..59ba0aaa9531 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -192,7 +192,6 @@ enum {
int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
struct dmz_metadata **zmd, const char *devname);
void dmz_dtr_metadata(struct dmz_metadata *zmd);
-int dmz_resume_metadata(struct dmz_metadata *zmd);
void dmz_lock_map(struct dmz_metadata *zmd);
void dmz_unlock_map(struct dmz_metadata *zmd);
@@ -230,7 +229,6 @@ unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx);
unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx);
unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx);
unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd);
-unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd);
unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd);
unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7d0746b37c8e..4d1e42891d24 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -11,6 +11,7 @@
#include "dm-uevent.h"
#include "dm-ima.h"
+#include <linux/bio-integrity.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -645,7 +646,7 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
/* Set default bdev, but target must bio_set_dev() before issuing IO */
clone->bi_bdev = md->disk->part0;
- if (unlikely(ti->needs_bio_set_dev))
+ if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
bio_set_dev(clone, md->disk->part0);
if (len) {
@@ -1086,7 +1087,7 @@ void disable_discard(struct mapped_device *md)
struct queue_limits *limits = dm_get_queue_limits(md);
/* device doesn't really support DISCARD, disable it */
- limits->max_discard_sectors = 0;
+ limits->max_hw_discard_sectors = 0;
}
void disable_write_zeroes(struct mapped_device *md)
@@ -1107,7 +1108,7 @@ static void clone_endio(struct bio *bio)
blk_status_t error = bio->bi_status;
struct dm_target_io *tio = clone_to_tio(bio);
struct dm_target *ti = tio->ti;
- dm_endio_fn endio = ti->type->end_io;
+ dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
struct dm_io *io = tio->io;
struct mapped_device *md = io->md;
@@ -1154,7 +1155,7 @@ static void clone_endio(struct bio *bio)
}
if (static_branch_unlikely(&swap_bios_enabled) &&
- unlikely(swap_bios_limit(ti, bio)))
+ likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
up(&md->swap_bios_semaphore);
free_tio(bio);
@@ -1188,7 +1189,7 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
return len;
return min_t(sector_t, len,
min(max_sectors ? : queue_max_sectors(ti->table->md->queue),
- blk_chunk_sectors_left(target_offset, max_granularity)));
+ blk_boundary_sectors_left(target_offset, max_granularity)));
}
static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
@@ -1428,25 +1429,12 @@ static void __map_bio(struct bio *clone)
down(&md->swap_bios_semaphore);
}
- if (static_branch_unlikely(&zoned_enabled)) {
- /*
- * Check if the IO needs a special mapping due to zone append
- * emulation on zoned target. In this case, dm_zone_map_bio()
- * calls the target map operation.
- */
- if (unlikely(dm_emulate_zone_append(md)))
- r = dm_zone_map_bio(tio);
- else
- goto do_map;
- } else {
-do_map:
- if (likely(ti->type->map == linear_map))
- r = linear_map(ti, clone);
- else if (ti->type->map == stripe_map)
- r = stripe_map(ti, clone);
- else
- r = ti->type->map(ti, clone);
- }
+ if (likely(ti->type->map == linear_map))
+ r = linear_map(ti, clone);
+ else if (ti->type->map == stripe_map)
+ r = stripe_map(ti, clone);
+ else
+ r = ti->type->map(ti, clone);
switch (r) {
case DM_MAPIO_SUBMITTED:
@@ -1491,12 +1479,12 @@ static void setup_split_accounting(struct clone_info *ci, unsigned int len)
static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
struct dm_target *ti, unsigned int num_bios,
- unsigned *len, gfp_t gfp_flag)
+ unsigned *len)
{
struct bio *bio;
- int try = (gfp_flag & GFP_NOWAIT) ? 0 : 1;
+ int try;
- for (; try < 2; try++) {
+ for (try = 0; try < 2; try++) {
int bio_nr;
if (try && num_bios > 1)
@@ -1520,8 +1508,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
}
static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
- unsigned int num_bios, unsigned int *len,
- gfp_t gfp_flag)
+ unsigned int num_bios, unsigned int *len)
{
struct bio_list blist = BIO_EMPTY_LIST;
struct bio *clone;
@@ -1538,7 +1525,7 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
* Using alloc_multiple_bios(), even if num_bios is 1, to consistently
* support allocating using GFP_NOWAIT with GFP_NOIO fallback.
*/
- alloc_multiple_bios(&blist, ci, ti, num_bios, len, gfp_flag);
+ alloc_multiple_bios(&blist, ci, ti, num_bios, len);
while ((clone = bio_list_pop(&blist))) {
if (num_bios > 1)
dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
@@ -1566,17 +1553,43 @@ static void __send_empty_flush(struct clone_info *ci)
ci->sector_count = 0;
ci->io->tio.clone.bi_iter.bi_size = 0;
- for (unsigned int i = 0; i < t->num_targets; i++) {
- unsigned int bios;
- struct dm_target *ti = dm_table_get_target(t, i);
+ if (!t->flush_bypasses_map) {
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ unsigned int bios;
+ struct dm_target *ti = dm_table_get_target(t, i);
- if (unlikely(ti->num_flush_bios == 0))
- continue;
+ if (unlikely(ti->num_flush_bios == 0))
+ continue;
- atomic_add(ti->num_flush_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
- NULL, GFP_NOWAIT);
- atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ atomic_add(ti->num_flush_bios, &ci->io->io_count);
+ bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
+ NULL);
+ atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ }
+ } else {
+ /*
+ * Note that there's no need to grab t->devices_lock here
+ * because the targets that support flush optimization don't
+ * modify the list of devices.
+ */
+ struct list_head *devices = dm_table_get_devices(t);
+ unsigned int len = 0;
+ struct dm_dev_internal *dd;
+ list_for_each_entry(dd, devices, list) {
+ struct bio *clone;
+ /*
+ * Note that the structure dm_target_io is not
+ * associated with any target (because the device may be
+ * used by multiple targets), so we set tio->ti = NULL.
+ * We must check for NULL in the I/O processing path, to
+ * avoid NULL pointer dereference.
+ */
+ clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
+ atomic_add(1, &ci->io->io_count);
+ bio_set_dev(clone, dd->dm_dev->bdev);
+ clone->bi_end_io = clone_endio;
+ dm_submit_bio_remap(clone, NULL);
+ }
}
/*
@@ -1598,7 +1611,7 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti,
__max_io_len(ti, ci->sector, max_granularity, max_sectors));
atomic_add(num_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, num_bios, &len, GFP_NOIO);
+ bios = __send_duplicate_bios(ci, ti, num_bios, &len);
/*
* alloc_io() takes one extra reference for submission, so the
* reference won't reach 0 without the following (+1) subtraction
@@ -1611,20 +1624,19 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti,
static bool is_abnormal_io(struct bio *bio)
{
- enum req_op op = bio_op(bio);
-
- if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
- switch (op) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- case REQ_OP_WRITE_ZEROES:
- return true;
- default:
- break;
- }
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ case REQ_OP_FLUSH:
+ return false;
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_ZONE_RESET_ALL:
+ return true;
+ default:
+ return false;
}
-
- return false;
}
static blk_status_t __process_abnormal_io(struct clone_info *ci,
@@ -1645,14 +1657,10 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
case REQ_OP_SECURE_ERASE:
num_bios = ti->num_secure_erase_bios;
max_sectors = limits->max_secure_erase_sectors;
- if (ti->max_secure_erase_granularity)
- max_granularity = max_sectors;
break;
case REQ_OP_WRITE_ZEROES:
num_bios = ti->num_write_zeroes_bios;
max_sectors = limits->max_write_zeroes_sectors;
- if (ti->max_write_zeroes_granularity)
- max_granularity = max_sectors;
break;
default:
break;
@@ -1737,6 +1745,9 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
+ if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count)
+ return BLK_STS_IOERR;
+
setup_split_accounting(ci, len);
if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) {
@@ -1774,6 +1785,150 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io,
ci->sector_count = 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+{
+ /*
+ * For mapped devices that need zone append emulation, we must
+ * split any large BIO that straddles zone boundaries.
+ */
+ return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
+ !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+ return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+}
+
+static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci,
+ struct dm_target *ti)
+{
+ struct bio_list blist = BIO_EMPTY_LIST;
+ struct mapped_device *md = ci->io->md;
+ unsigned int zone_sectors = md->disk->queue->limits.chunk_sectors;
+ unsigned long *need_reset;
+ unsigned int i, nr_zones, nr_reset;
+ unsigned int num_bios = 0;
+ blk_status_t sts = BLK_STS_OK;
+ sector_t sector = ti->begin;
+ struct bio *clone;
+ int ret;
+
+ nr_zones = ti->len >> ilog2(zone_sectors);
+ need_reset = bitmap_zalloc(nr_zones, GFP_NOIO);
+ if (!need_reset)
+ return BLK_STS_RESOURCE;
+
+ ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin,
+ nr_zones, need_reset);
+ if (ret) {
+ sts = BLK_STS_IOERR;
+ goto free_bitmap;
+ }
+
+ /* If we have no zone to reset, we are done. */
+ nr_reset = bitmap_weight(need_reset, nr_zones);
+ if (!nr_reset)
+ goto free_bitmap;
+
+ atomic_add(nr_zones, &ci->io->io_count);
+
+ for (i = 0; i < nr_zones; i++) {
+
+ if (!test_bit(i, need_reset)) {
+ sector += zone_sectors;
+ continue;
+ }
+
+ if (bio_list_empty(&blist)) {
+ /* This may take a while, so be nice to others */
+ if (num_bios)
+ cond_resched();
+
+ /*
+ * We may need to reset thousands of zones, so let's
+ * not go crazy with the clone allocation.
+ */
+ alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32),
+ NULL);
+ }
+
+ /* Get a clone and change it to a regular reset operation. */
+ clone = bio_list_pop(&blist);
+ clone->bi_opf &= ~REQ_OP_MASK;
+ clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC;
+ clone->bi_iter.bi_sector = sector;
+ clone->bi_iter.bi_size = 0;
+ __map_bio(clone);
+
+ sector += zone_sectors;
+ num_bios++;
+ nr_reset--;
+ }
+
+ WARN_ON_ONCE(!bio_list_empty(&blist));
+ atomic_sub(nr_zones - num_bios, &ci->io->io_count);
+ ci->sector_count = 0;
+
+free_bitmap:
+ bitmap_free(need_reset);
+
+ return sts;
+}
+
+static void __send_zone_reset_all_native(struct clone_info *ci,
+ struct dm_target *ti)
+{
+ unsigned int bios;
+
+ atomic_add(1, &ci->io->io_count);
+ bios = __send_duplicate_bios(ci, ti, 1, NULL);
+ atomic_sub(1 - bios, &ci->io->io_count);
+
+ ci->sector_count = 0;
+}
+
+static blk_status_t __send_zone_reset_all(struct clone_info *ci)
+{
+ struct dm_table *t = ci->map;
+ blk_status_t sts = BLK_STS_OK;
+
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (ti->zone_reset_all_supported) {
+ __send_zone_reset_all_native(ci, ti);
+ continue;
+ }
+
+ sts = __send_zone_reset_all_emulated(ci, ti);
+ if (sts != BLK_STS_OK)
+ break;
+ }
+
+ /* Release the reference that alloc_io() took for submission. */
+ atomic_sub(1, &ci->io->io_count);
+
+ return sts;
+}
+
+#else
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+{
+ return false;
+}
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+ return false;
+}
+static blk_status_t __send_zone_reset_all(struct clone_info *ci)
+{
+ return BLK_STS_NOTSUPP;
+}
+#endif
+
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
@@ -1783,21 +1938,48 @@ static void dm_split_and_process_bio(struct mapped_device *md,
struct clone_info ci;
struct dm_io *io;
blk_status_t error = BLK_STS_OK;
- bool is_abnormal;
+ bool is_abnormal, need_split;
is_abnormal = is_abnormal_io(bio);
- if (unlikely(is_abnormal)) {
+ if (static_branch_unlikely(&zoned_enabled)) {
+ /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */
+ need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) &&
+ (is_abnormal || dm_zone_bio_needs_split(md, bio));
+ } else {
+ need_split = is_abnormal;
+ }
+
+ if (unlikely(need_split)) {
/*
* Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
* otherwise associated queue_limits won't be imposed.
+ * Also split the BIO for mapped devices needing zone append
+ * emulation to ensure that the BIO does not cross zone
+ * boundaries.
*/
bio = bio_split_to_limits(bio);
if (!bio)
return;
}
+ /*
+ * Use the block layer zone write plugging for mapped devices that
+ * need zone append emulation (e.g. dm-crypt).
+ */
+ if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
+ return;
+
/* Only support nowait for normal IO */
if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
+ /*
+ * Don't support NOWAIT for FLUSH because it may allocate
+ * multiple bios and there's no easy way to undo the
+ * allocations.
+ */
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ bio_wouldblock_error(bio);
+ return;
+ }
io = alloc_io(md, bio, GFP_NOWAIT);
if (unlikely(!io)) {
/* Unable to do anything without dm_io. */
@@ -1815,6 +1997,12 @@ static void dm_split_and_process_bio(struct mapped_device *md,
goto out;
}
+ if (static_branch_unlikely(&zoned_enabled) &&
+ (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) {
+ error = __send_zone_reset_all(&ci);
+ goto out;
+ }
+
error = __split_and_process_bio(&ci);
if (error || !ci.sector_count)
goto out;
@@ -1853,10 +2041,15 @@ static void dm_submit_bio(struct bio *bio)
struct dm_table *map;
map = dm_get_live_table(md, &srcu_idx);
+ if (unlikely(!map)) {
+ DMERR_LIMIT("%s: mapping table unavailable, erroring io",
+ dm_device_name(md));
+ bio_io_error(bio);
+ goto out;
+ }
- /* If suspended, or map not yet available, queue this IO for later */
- if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
- unlikely(!map)) {
+ /* If suspended, queue this IO for later */
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
else if (bio->bi_opf & REQ_RAHEAD)
@@ -2016,7 +2209,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
md->dax_dev = NULL;
}
- dm_cleanup_zoned_dev(md);
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
@@ -2109,8 +2301,10 @@ static struct mapped_device *alloc_dev(int minor)
* override accordingly.
*/
md->disk = blk_alloc_disk(NULL, md->numa_node_id);
- if (IS_ERR(md->disk))
+ if (IS_ERR(md->disk)) {
+ md->disk = NULL;
goto bad;
+ }
md->queue = md->disk->queue;
init_waitqueue_head(&md->wait);
@@ -2334,12 +2528,6 @@ void dm_unlock_md_type(struct mapped_device *md)
mutex_unlock(&md->type_lock);
}
-void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
-{
- BUG_ON(!mutex_is_locked(&md->type_lock));
- md->type = type;
-}
-
enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
return md->type;
@@ -2360,22 +2548,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
struct table_device *td;
int r;
- switch (type) {
- case DM_TYPE_REQUEST_BASED:
+ WARN_ON_ONCE(type == DM_TYPE_NONE);
+
+ if (type == DM_TYPE_REQUEST_BASED) {
md->disk->fops = &dm_rq_blk_dops;
r = dm_mq_init_request_queue(md, t);
if (r) {
DMERR("Cannot initialize queue for request-based dm mapped device");
return r;
}
- break;
- case DM_TYPE_BIO_BASED:
- case DM_TYPE_DAX_BIO_BASED:
- blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue);
- break;
- case DM_TYPE_NONE:
- WARN_ON_ONCE(true);
- break;
}
r = dm_calculate_queue_limits(t, &limits);
@@ -2568,7 +2749,7 @@ static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int ta
break;
if (signal_pending_state(task_state, current)) {
- r = -EINTR;
+ r = -ERESTARTSYS;
break;
}
@@ -2593,7 +2774,7 @@ static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_st
break;
if (signal_pending_state(task_state, current)) {
- r = -EINTR;
+ r = -ERESTARTSYS;
break;
}
@@ -3173,6 +3354,59 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
kfree(pools);
}
+struct dm_blkdev_id {
+ u8 *id;
+ enum blk_unique_id type;
+};
+
+static int __dm_get_unique_id(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_blkdev_id *dm_id = data;
+ const struct block_device_operations *fops = dev->bdev->bd_disk->fops;
+
+ if (!fops->get_unique_id)
+ return 0;
+
+ return fops->get_unique_id(dev->bdev->bd_disk, dm_id->id, dm_id->type);
+}
+
+/*
+ * Allow access to get_unique_id() for the first device returning a
+ * non-zero result. Reasonable use expects all devices to have the
+ * same unique id.
+ */
+static int dm_blk_get_unique_id(struct gendisk *disk, u8 *id,
+ enum blk_unique_id type)
+{
+ struct mapped_device *md = disk->private_data;
+ struct dm_table *table;
+ struct dm_target *ti;
+ int ret = 0, srcu_idx;
+
+ struct dm_blkdev_id dm_id = {
+ .id = id,
+ .type = type,
+ };
+
+ table = dm_get_live_table(md, &srcu_idx);
+ if (!table || !dm_table_get_size(table))
+ goto out;
+
+ /* We only support devices that have a single target */
+ if (table->num_targets != 1)
+ goto out;
+ ti = dm_table_get_target(table, 0);
+
+ if (!ti->type->iterate_devices)
+ goto out;
+
+ ret = ti->type->iterate_devices(ti, __dm_get_unique_id, &dm_id);
+out:
+ dm_put_live_table(md, srcu_idx);
+ return ret;
+}
+
struct dm_pr {
u64 old_key;
u64 new_key;
@@ -3498,6 +3732,7 @@ static const struct block_device_operations dm_blk_dops = {
.ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
.report_zones = dm_blk_report_zones,
+ .get_unique_id = dm_blk_get_unique_id,
.pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
@@ -3507,6 +3742,7 @@ static const struct block_device_operations dm_rq_blk_dops = {
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
+ .get_unique_id = dm_blk_get_unique_id,
.pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 7f1acbf6bd9e..a0a8ff119815 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -71,12 +71,10 @@ enum dm_queue_mode dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
-bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
-void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type);
enum dm_queue_mode dm_get_md_type(struct mapped_device *md);
struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
@@ -101,25 +99,23 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
/*
* Zoned targets related functions.
*/
-int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
+int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
+ struct queue_limits *lim);
+int dm_revalidate_zones(struct dm_table *t, struct request_queue *q);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
-void dm_cleanup_zoned_dev(struct mapped_device *md);
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
-int dm_zone_map_bio(struct dm_target_io *io);
+int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
+ sector_t sector, unsigned int nr_zones,
+ unsigned long *need_reset);
#else
-static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {}
#define dm_blk_report_zones NULL
static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
return false;
}
-static inline int dm_zone_map_bio(struct dm_target_io *tio)
-{
- return DM_MAPIO_KILL;
-}
#endif
/*
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index b2a00f213c2c..4b80165afd23 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -49,6 +49,7 @@ static int md_setup_ents __initdata;
* instead of just one. -- KTK
* 18May2000: Added support for persistent-superblock arrays:
* md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
* md=n,device-list reads a RAID superblock from the devices
* elements in device-list are read by name_to_kdev_t so can be
* a hex number or something like /dev/hda1 /dev/sdb
@@ -87,7 +88,7 @@ static int __init md_setup(char *str)
md_setup_ents++;
switch (get_option(&str, &level)) { /* RAID level */
case 2: /* could be 0 or -1.. */
- if (level == 0) {
+ if (level == 0 || level == LEVEL_LINEAR) {
if (get_option(&str, &factor) != 2 || /* Chunk Size */
get_option(&str, &fault) != 2) {
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
@@ -95,7 +96,10 @@ static int __init md_setup(char *str)
}
md_setup_args[ent].level = level;
md_setup_args[ent].chunk = 1 << (factor+12);
- pername = "raid0";
+ if (level == LEVEL_LINEAR)
+ pername = "linear";
+ else
+ pername = "raid0";
break;
}
fallthrough;
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 059afc24c08b..23c09d22fcdb 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -32,11 +32,210 @@
#include "md.h"
#include "md-bitmap.h"
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ * Version 5 is currently set only for clustered devices
+ */
+#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_CLUSTERED 5
+#define BITMAP_MAJOR_HOSTENDIAN 3
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed. The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync | counter |
+ * | needed | active | |
+ * | (0-1) | (0-1) | (0-16383) |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ * a '1' bit is read from storage at startup.
+ * a write request fails on some drives
+ * a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ * a resync is started on all drives, and resync_needed is set.
+ * resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared as well, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the field is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stripe is clean
+ * Anything that dirties the stripe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep finds the counter at 2, it is decremented to 1.
+ * If the sweep finds the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ * page pointer (32-bit)
+ *
+ * [ ] ------+
+ * |
+ * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
+ * c1 c2 c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ * hijacked page pointer (32-bit)
+ *
+ * [ ][ ] (no page memory allocated)
+ * counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SHIFT 9
+
+/*
+ * bitmap structures:
+ */
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+ /*
+ * map points to the actual memory page
+ */
+ char *map;
+ /*
+ * in emergencies (when map cannot be alloced), hijack the map
+ * pointer and use it as two counters itself
+ */
+ unsigned int hijacked:1;
+ /*
+ * If any counter in this page is '1' or '2' - and so could be
+ * cleared then that page is marked as 'pending'
+ */
+ unsigned int pending:1;
+ /*
+ * count of dirty bits on the page
+ */
+ unsigned int count:30;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+
+ struct bitmap_counts {
+ spinlock_t lock;
+ struct bitmap_page *bp;
+ /* total number of pages in the bitmap */
+ unsigned long pages;
+ /* number of pages not yet allocated */
+ unsigned long missing_pages;
+ /* chunksize = 2^chunkshift (for bitops) */
+ unsigned long chunkshift;
+ /* total number of data chunks for the array */
+ unsigned long chunks;
+ } counts;
+
+ struct mddev *mddev; /* the md device that the bitmap is for */
+
+ __u64 events_cleared;
+ int need_sync;
+
+ struct bitmap_storage {
+ /* backing disk file */
+ struct file *file;
+ /* cached copy of the bitmap file superblock */
+ struct page *sb_page;
+ unsigned long sb_index;
+ /* list of cache pages for the file */
+ struct page **filemap;
+ /* attributes associated filemap pages */
+ unsigned long *filemap_attr;
+ /* number of pages in the file */
+ unsigned long file_pages;
+ /* total bytes in the bitmap */
+ unsigned long bytes;
+ } storage;
+
+ unsigned long flags;
+
+ int allclean;
+
+ atomic_t behind_writes;
+ /* highest actual value at runtime */
+ unsigned long behind_writes_used;
+
+ /*
+ * the bitmap daemon - periodically wakes up and sweeps the bitmap
+ * file, cleaning up bits and flushing out pages to disk as necessary
+ */
+ unsigned long daemon_lastrun; /* jiffies of last run */
+ /*
+ * when we last called end_sync to update bitmap with resync
+ * progress.
+ */
+ unsigned long last_end_sync;
+
+ /* pending writes to the bitmap file */
+ atomic_t pending_writes;
+ wait_queue_head_t write_wait;
+ wait_queue_head_t overflow_wait;
+ wait_queue_head_t behind_wait;
+
+ struct kernfs_node *sysfs_can_clear;
+ /* slot offset for clustered env */
+ int cluster_slot;
+};
+
+static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+ int chunksize, bool init);
+
static inline char *bmname(struct bitmap *bitmap)
{
return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}
+static bool __bitmap_enabled(struct bitmap *bitmap)
+{
+ return bitmap->storage.filemap && /* the file page array exists */
+ !test_bit(BITMAP_STALE, &bitmap->flags); /* and the bitmap is not stale */
+}
+/* mddev-level variant: an array with no bitmap attached reports false */
+static bool bitmap_enabled(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return false;
+
+ return __bitmap_enabled(bitmap);
+}
+
/*
* check a page and, if necessary, allocate it (or hijack it if the alloc fails)
*
@@ -227,6 +426,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
+ unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) <<
+ PAGE_SHIFT;
loff_t sboff, offset = mddev->bitmap_info.offset;
sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
unsigned int size = PAGE_SIZE;
@@ -269,11 +470,9 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
if (size == 0)
/* bitmap runs in to data */
return -EINVAL;
- } else {
- /* DATA METADATA BITMAP - no problems */
}
- md_super_write(mddev, rdev, sboff + ps, (int) size, page);
+ md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
return 0;
}
@@ -360,7 +559,7 @@ static int read_file_page(struct file *file, unsigned long index,
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT);
- bh = alloc_page_buffers(page, blocksize, false);
+ bh = alloc_page_buffers(page, blocksize);
if (!bh) {
ret = -ENOMEM;
goto out;
@@ -472,9 +671,10 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap)
/* update the event counter and sync the superblock to disk */
-void md_bitmap_update_sb(struct bitmap *bitmap)
+static void bitmap_update_sb(void *data)
{
bitmap_super_t *sb;
+ struct bitmap *bitmap = data;
if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
return;
@@ -482,7 +682,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
return;
if (!bitmap->storage.sb_page) /* no superblock */
return;
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
sb->events = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared)
/* rocking back to read-only */
@@ -502,7 +702,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
- kunmap_atomic(sb);
+ kunmap_local(sb);
if (bitmap->storage.file)
write_file_page(bitmap, bitmap->storage.sb_page, 1);
@@ -510,16 +710,14 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
write_sb_page(bitmap, bitmap->storage.sb_index,
bitmap->storage.sb_page, 1);
}
-EXPORT_SYMBOL(md_bitmap_update_sb);
-/* print out the bitmap file superblock */
-void md_bitmap_print_sb(struct bitmap *bitmap)
+static void bitmap_print_sb(struct bitmap *bitmap)
{
bitmap_super_t *sb;
if (!bitmap || !bitmap->storage.sb_page)
return;
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
pr_debug(" version: %u\n", le32_to_cpu(sb->version));
@@ -538,7 +736,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
pr_debug(" sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
- kunmap_atomic(sb);
+ kunmap_local(sb);
}
/*
@@ -562,7 +760,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
return -ENOMEM;
bitmap->storage.sb_index = 0;
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
sb->magic = cpu_to_le32(BITMAP_MAGIC);
sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -570,7 +768,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
chunksize = bitmap->mddev->bitmap_info.chunksize;
BUG_ON(!chunksize);
if (!is_power_of_2(chunksize)) {
- kunmap_atomic(sb);
+ kunmap_local(sb);
pr_warn("bitmap chunksize not a power of 2\n");
return -EINVAL;
}
@@ -605,7 +803,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
bitmap->mddev->bitmap_info.nodes = 0;
- kunmap_atomic(sb);
+ kunmap_local(sb);
return 0;
}
@@ -667,7 +865,7 @@ re_read:
return err;
err = -EINVAL;
- sb = kmap_atomic(sb_page);
+ sb = kmap_local_page(sb_page);
chunksize = le32_to_cpu(sb->chunksize);
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -734,7 +932,7 @@ re_read:
err = 0;
out:
- kunmap_atomic(sb);
+ kunmap_local(sb);
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
/* Assigning chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
@@ -760,7 +958,7 @@ out_no_sb:
bitmap->mddev->bitmap_info.space > sectors_reserved)
bitmap->mddev->bitmap_info.space = sectors_reserved;
} else {
- md_bitmap_print_sb(bitmap);
+ bitmap_print_sb(bitmap);
if (bitmap->cluster_slot < 0)
md_cluster_stop(bitmap->mddev);
}
@@ -893,7 +1091,7 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store)
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
if (bitmap->storage.file) {
pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
@@ -963,12 +1161,12 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
bit = file_page_offset(&bitmap->storage, chunk);
/* set the bit */
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
set_bit(bit, kaddr);
else
set_bit_le(bit, kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, index);
/* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
@@ -992,12 +1190,12 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
if (!page)
return;
bit = file_page_offset(&bitmap->storage, chunk);
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
clear_bit(bit, paddr);
else
clear_bit_le(bit, paddr);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
@@ -1016,25 +1214,25 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
if (!page)
return -EINVAL;
bit = file_page_offset(&bitmap->storage, chunk);
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
set = test_bit(bit, paddr);
else
set = test_bit_le(bit, paddr);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
return set;
}
/* this gets called when the md device is ready to unplug its underlying
* (slave) device queues -- before we let any writes go down, we need to
* sync the dirty pages of the bitmap file to disk */
-void md_bitmap_unplug(struct bitmap *bitmap)
+static void __bitmap_unplug(struct bitmap *bitmap)
{
unsigned long i;
int dirty, need_write;
int writing = 0;
- if (!md_bitmap_enabled(bitmap))
+ if (!__bitmap_enabled(bitmap))
return;
/* look at each page to see if there are any set bits that need to be
@@ -1060,7 +1258,6 @@ void md_bitmap_unplug(struct bitmap *bitmap)
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
md_bitmap_file_kick(bitmap);
}
-EXPORT_SYMBOL(md_bitmap_unplug);
struct bitmap_unplug_work {
struct work_struct work;
@@ -1073,11 +1270,11 @@ static void md_bitmap_unplug_fn(struct work_struct *work)
struct bitmap_unplug_work *unplug_work =
container_of(work, struct bitmap_unplug_work, work);
- md_bitmap_unplug(unplug_work->bitmap);
+ __bitmap_unplug(unplug_work->bitmap);
complete(unplug_work->done);
}
-void md_bitmap_unplug_async(struct bitmap *bitmap)
+static void bitmap_unplug_async(struct bitmap *bitmap)
{
DECLARE_COMPLETION_ONSTACK(done);
struct bitmap_unplug_work unplug_work;
@@ -1088,8 +1285,21 @@ void md_bitmap_unplug_async(struct bitmap *bitmap)
queue_work(md_bitmap_wq, &unplug_work.work);
wait_for_completion(&done);
+ destroy_work_on_stack(&unplug_work.work);
+}
+
+static void bitmap_unplug(struct mddev *mddev, bool sync)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return; /* no bitmap attached: nothing to flush */
+ /* sync: flush in this context; else run the flush on md_bitmap_wq and wait for it */
+ if (sync)
+ __bitmap_unplug(bitmap);
+ else
+ bitmap_unplug_async(bitmap);
}
-EXPORT_SYMBOL(md_bitmap_unplug_async);
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
@@ -1178,9 +1388,9 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
* If the bitmap is out of date, dirty the whole page
* and write it out
*/
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
memset(paddr + offset, 0xff, PAGE_SIZE - offset);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
filemap_write_page(bitmap, i, true);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
@@ -1196,12 +1406,12 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
void *paddr;
bool was_set;
- paddr = kmap_atomic(page);
+ paddr = kmap_local_page(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
was_set = test_bit(bit, paddr);
else
was_set = test_bit_le(bit, paddr);
- kunmap_atomic(paddr);
+ kunmap_local(paddr);
if (was_set) {
/* if the disk bit is set, set the memory bit */
@@ -1226,22 +1436,21 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
return ret;
}
-void md_bitmap_write_all(struct bitmap *bitmap)
+/* just flag bitmap pages as needing to be written. */
+static void bitmap_write_all(struct mddev *mddev)
{
- /* We don't actually write all bitmap blocks here,
- * just flag them as needing to be written
- */
int i;
+ struct bitmap *bitmap = mddev->bitmap;
if (!bitmap || !bitmap->storage.filemap)
return;
+
+ /* Only one copy, so nothing needed */
if (bitmap->storage.file)
- /* Only one copy, so nothing needed */
return;
for (i = 0; i < bitmap->storage.file_pages; i++)
- set_page_attr(bitmap, i,
- BITMAP_PAGE_NEEDWRITE);
+ set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
bitmap->allclean = 0;
}
@@ -1290,7 +1499,7 @@ out:
* bitmap daemon -- periodically wakes up to clean bits and flush pages
* out to disk
*/
-void md_bitmap_daemon_work(struct mddev *mddev)
+static void bitmap_daemon_work(struct mddev *mddev)
{
struct bitmap *bitmap;
unsigned long j;
@@ -1337,10 +1546,10 @@ void md_bitmap_daemon_work(struct mddev *mddev)
bitmap_super_t *sb;
bitmap->need_sync = 0;
if (bitmap->storage.filemap) {
- sb = kmap_atomic(bitmap->storage.sb_page);
+ sb = kmap_local_page(bitmap->storage.sb_page);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
- kunmap_atomic(sb);
+ kunmap_local(sb);
set_page_attr(bitmap, 0,
BITMAP_PAGE_NEEDWRITE);
}
@@ -1424,7 +1633,7 @@ __acquires(bitmap->lock)
sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
- sector_t csize;
+ sector_t csize = ((sector_t)1) << bitmap->chunkshift;
int err;
if (page >= bitmap->pages) {
@@ -1433,6 +1642,7 @@ __acquires(bitmap->lock)
* End-of-device while looking for a whole page or
* user set a huge number to sysfs bitmap_set_bits.
*/
+ *blocks = csize - (offset & (csize - 1));
return NULL;
}
err = md_bitmap_checkpage(bitmap, page, create, 0);
@@ -1441,8 +1651,7 @@ __acquires(bitmap->lock)
bitmap->bp[page].map == NULL)
csize = ((sector_t)1) << (bitmap->chunkshift +
PAGE_COUNTER_SHIFT);
- else
- csize = ((sector_t)1) << bitmap->chunkshift;
+
*blocks = csize - (offset & (csize - 1));
if (err < 0)
@@ -1461,22 +1670,14 @@ __acquires(bitmap->lock)
&(bitmap->bp[page].map[pageoff]);
}
-int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
+static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
{
+ struct bitmap *bitmap = mddev->bitmap;
+
if (!bitmap)
return 0;
- if (behind) {
- int bw;
- atomic_inc(&bitmap->behind_writes);
- bw = atomic_read(&bitmap->behind_writes);
- if (bw > bitmap->behind_writes_used)
- bitmap->behind_writes_used = bw;
-
- pr_debug("inc write-behind count %d/%lu\n",
- bw, bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
bitmap_counter_t *bmc;
@@ -1523,20 +1724,14 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
}
return 0;
}
-EXPORT_SYMBOL(md_bitmap_startwrite);
-void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
- unsigned long sectors, int success, int behind)
+static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
{
+ struct bitmap *bitmap = mddev->bitmap;
+
if (!bitmap)
return;
- if (behind) {
- if (atomic_dec_and_test(&bitmap->behind_writes))
- wake_up(&bitmap->behind_wait);
- pr_debug("dec write-behind count %d/%lu\n",
- atomic_read(&bitmap->behind_writes),
- bitmap->mddev->bitmap_info.max_write_behind);
- }
while (sectors) {
sector_t blocks;
@@ -1550,15 +1745,16 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
return;
}
- if (success && !bitmap->mddev->degraded &&
- bitmap->events_cleared < bitmap->mddev->events) {
- bitmap->events_cleared = bitmap->mddev->events;
- bitmap->need_sync = 1;
- sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
- }
-
- if (!success && !NEEDED(*bmc))
+ if (!bitmap->mddev->degraded) {
+ if (bitmap->events_cleared < bitmap->mddev->events) {
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->need_sync = 1;
+ sysfs_notify_dirent_safe(
+ bitmap->sysfs_can_clear);
+ }
+ } else if (!NEEDED(*bmc)) {
*bmc |= NEEDED_MASK;
+ }
if (COUNTER(*bmc) == COUNTER_MAX)
wake_up(&bitmap->overflow_wait);
@@ -1576,26 +1772,27 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
sectors = 0;
}
}
-EXPORT_SYMBOL(md_bitmap_endwrite);
-static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
- int degraded)
+static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
+ sector_t *blocks, bool degraded)
{
bitmap_counter_t *bmc;
- int rv;
+ bool rv;
+
if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
*blocks = 1024;
- return 1; /* always resync if no bitmap */
+ return true; /* always resync if no bitmap */
}
spin_lock_irq(&bitmap->counts.lock);
+
+ rv = false;
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
- rv = 0;
if (bmc) {
/* locked */
- if (RESYNC(*bmc))
- rv = 1;
- else if (NEEDED(*bmc)) {
- rv = 1;
+ if (RESYNC(*bmc)) {
+ rv = true;
+ } else if (NEEDED(*bmc)) {
+ rv = true;
if (!degraded) { /* don't set/clear bits if degraded */
*bmc |= RESYNC_MASK;
*bmc &= ~NEEDED_MASK;
@@ -1603,11 +1800,12 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
}
}
spin_unlock_irq(&bitmap->counts.lock);
+
return rv;
}
-int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
- int degraded)
+static bool bitmap_start_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded)
{
/* bitmap_start_sync must always report on multiples of whole
* pages, otherwise resync (which is very PAGE_SIZE based) will
@@ -1616,21 +1814,22 @@ int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *block
* At least PAGE_SIZE>>9 blocks are covered.
* Return the 'or' of the result.
*/
- int rv = 0;
+ bool rv = false;
sector_t blocks1;
*blocks = 0;
while (*blocks < (PAGE_SIZE>>9)) {
- rv |= __bitmap_start_sync(bitmap, offset,
+ rv |= __bitmap_start_sync(mddev->bitmap, offset,
&blocks1, degraded);
offset += blocks1;
*blocks += blocks1;
}
+
return rv;
}
-EXPORT_SYMBOL(md_bitmap_start_sync);
-void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted)
+static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
+ sector_t *blocks, bool aborted)
{
bitmap_counter_t *bmc;
unsigned long flags;
@@ -1659,9 +1858,14 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks
unlock:
spin_unlock_irqrestore(&bitmap->counts.lock, flags);
}
-EXPORT_SYMBOL(md_bitmap_end_sync);
-void md_bitmap_close_sync(struct bitmap *bitmap)
+static void bitmap_end_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks)
+{
+ __bitmap_end_sync(mddev->bitmap, offset, blocks, true); /* always passes aborted == true */
+}
+
+static void bitmap_close_sync(struct mddev *mddev)
{
/* Sync has finished, and any bitmap chunks that weren't synced
* properly have been aborted. It remains to us to clear the
@@ -1669,19 +1873,23 @@ void md_bitmap_close_sync(struct bitmap *bitmap)
*/
sector_t sector = 0;
sector_t blocks;
+ struct bitmap *bitmap = mddev->bitmap;
+
if (!bitmap)
return;
+
while (sector < bitmap->mddev->resync_max_sectors) {
- md_bitmap_end_sync(bitmap, sector, &blocks, 0);
+ __bitmap_end_sync(bitmap, sector, &blocks, false);
sector += blocks;
}
}
-EXPORT_SYMBOL(md_bitmap_close_sync);
-void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
+static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
+ bool force)
{
sector_t s = 0;
sector_t blocks;
+ struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
@@ -1700,34 +1908,32 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
- md_bitmap_end_sync(bitmap, s, &blocks, 0);
+ __bitmap_end_sync(bitmap, s, &blocks, false);
s += blocks;
}
bitmap->last_end_sync = jiffies;
sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
}
-EXPORT_SYMBOL(md_bitmap_cond_end_sync);
-void md_bitmap_sync_with_cluster(struct mddev *mddev,
- sector_t old_lo, sector_t old_hi,
- sector_t new_lo, sector_t new_hi)
+static void bitmap_sync_with_cluster(struct mddev *mddev,
+ sector_t old_lo, sector_t old_hi,
+ sector_t new_lo, sector_t new_hi)
{
struct bitmap *bitmap = mddev->bitmap;
sector_t sector, blocks = 0;
for (sector = old_lo; sector < new_lo; ) {
- md_bitmap_end_sync(bitmap, sector, &blocks, 0);
+ __bitmap_end_sync(bitmap, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
for (sector = old_hi; sector < new_hi; ) {
- md_bitmap_start_sync(bitmap, sector, &blocks, 0);
+ bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
}
-EXPORT_SYMBOL(md_bitmap_sync_with_cluster);
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
{
@@ -1756,12 +1962,18 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in
}
/* dirty the memory and file bits for bitmap chunks "s" to "e" */
-void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
+static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
+ unsigned long e)
{
unsigned long chunk;
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return;
for (chunk = s; chunk <= e; chunk++) {
sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
+
md_bitmap_set_memory_bits(bitmap, sec, 1);
md_bitmap_file_set_bit(bitmap, sec);
if (sec < bitmap->mddev->recovery_cp)
@@ -1773,10 +1985,7 @@ void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long
}
}
-/*
- * flush out any pending updates
- */
-void md_bitmap_flush(struct mddev *mddev)
+static void bitmap_flush(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
long sleep;
@@ -1789,23 +1998,21 @@ void md_bitmap_flush(struct mddev *mddev)
*/
sleep = mddev->bitmap_info.daemon_sleep * 2;
bitmap->daemon_lastrun -= sleep;
- md_bitmap_daemon_work(mddev);
+ bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
- md_bitmap_daemon_work(mddev);
+ bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
- md_bitmap_daemon_work(mddev);
+ bitmap_daemon_work(mddev);
if (mddev->bitmap_info.external)
md_super_wait(mddev);
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
}
-/*
- * free memory that was allocated
- */
-void md_bitmap_free(struct bitmap *bitmap)
+static void md_bitmap_free(void *data)
{
unsigned long k, pages;
struct bitmap_page *bp;
+ struct bitmap *bitmap = data;
if (!bitmap) /* there was no bitmap */
return;
@@ -1836,9 +2043,39 @@ void md_bitmap_free(struct bitmap *bitmap)
kfree(bp);
kfree(bitmap);
}
-EXPORT_SYMBOL(md_bitmap_free);
-void md_bitmap_wait_behind_writes(struct mddev *mddev)
+static void bitmap_start_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ int bw;
+
+ if (!bitmap)
+ return;
+ /* count one more write-behind and keep behind_writes_used as the high-water mark */
+ atomic_inc(&bitmap->behind_writes);
+ bw = atomic_read(&bitmap->behind_writes); /* separate atomic op from the inc above */
+ if (bw > bitmap->behind_writes_used)
+ bitmap->behind_writes_used = bw;
+
+ pr_debug("inc write-behind count %d/%lu\n",
+ bw, bitmap->mddev->bitmap_info.max_write_behind);
+}
+
+static void bitmap_end_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return;
+ /* when the count reaches zero, wake anyone sleeping on behind_wait */
+ if (atomic_dec_and_test(&bitmap->behind_writes))
+ wake_up(&bitmap->behind_wait);
+ pr_debug("dec write-behind count %d/%lu\n",
+ atomic_read(&bitmap->behind_writes),
+ bitmap->mddev->bitmap_info.max_write_behind);
+}
+
+static void bitmap_wait_behind_writes(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -1852,14 +2089,14 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev)
}
}
-void md_bitmap_destroy(struct mddev *mddev)
+static void bitmap_destroy(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap) /* there was no bitmap */
return;
- md_bitmap_wait_behind_writes(mddev);
+ bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy)
mddev_destroy_serial_pool(mddev, NULL);
@@ -1878,7 +2115,7 @@ void md_bitmap_destroy(struct mddev *mddev)
* if this returns an error, bitmap_destroy must be called to do clean up
* once mddev->bitmap is set
*/
-struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
+static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
{
struct bitmap *bitmap;
sector_t blocks = mddev->resync_max_sectors;
@@ -1948,7 +2185,8 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
goto error;
bitmap->daemon_lastrun = jiffies;
- err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
+ err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize,
+ true);
if (err)
goto error;
@@ -1965,7 +2203,18 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
return ERR_PTR(err);
}
-int md_bitmap_load(struct mddev *mddev)
+static int bitmap_create(struct mddev *mddev, int slot)
+{
+ struct bitmap *bitmap = __bitmap_create(mddev, slot);
+
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap); /* mddev->bitmap is left untouched on failure */
+ /* attach the new bitmap to the mddev only after creation succeeded */
+ mddev->bitmap = bitmap;
+ return 0;
+}
+
+static int bitmap_load(struct mddev *mddev)
{
int err = 0;
sector_t start = 0;
@@ -1989,10 +2238,10 @@ int md_bitmap_load(struct mddev *mddev)
*/
while (sector < mddev->resync_max_sectors) {
sector_t blocks;
- md_bitmap_start_sync(bitmap, sector, &blocks, 0);
+ bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
- md_bitmap_close_sync(bitmap);
+ bitmap_close_sync(mddev);
if (mddev->degraded == 0
|| bitmap->events_cleared == mddev->events)
@@ -2014,22 +2263,21 @@ int md_bitmap_load(struct mddev *mddev)
mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
md_wakeup_thread(mddev->thread);
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
err = -EIO;
out:
return err;
}
-EXPORT_SYMBOL_GPL(md_bitmap_load);
/* caller need to free returned bitmap with md_bitmap_free() */
-struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
+static void *bitmap_get_from_slot(struct mddev *mddev, int slot)
{
int rv = 0;
struct bitmap *bitmap;
- bitmap = md_bitmap_create(mddev, slot);
+ bitmap = __bitmap_create(mddev, slot);
if (IS_ERR(bitmap)) {
rv = PTR_ERR(bitmap);
return ERR_PTR(rv);
@@ -2043,20 +2291,19 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
return bitmap;
}
-EXPORT_SYMBOL(get_bitmap_from_slot);
/* Loads the bitmap associated with slot and copies the resync information
* to our bitmap
*/
-int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
- sector_t *low, sector_t *high, bool clear_bits)
+static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low,
+ sector_t *high, bool clear_bits)
{
int rv = 0, i, j;
sector_t block, lo = 0, hi = 0;
struct bitmap_counts *counts;
struct bitmap *bitmap;
- bitmap = get_bitmap_from_slot(mddev, slot);
+ bitmap = bitmap_get_from_slot(mddev, slot);
if (IS_ERR(bitmap)) {
pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
return -1;
@@ -2076,53 +2323,62 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
}
if (clear_bits) {
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
* BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
for (i = 0; i < bitmap->storage.file_pages; i++)
if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
- md_bitmap_unplug(bitmap);
+ __bitmap_unplug(bitmap);
}
- md_bitmap_unplug(mddev->bitmap);
+ __bitmap_unplug(mddev->bitmap);
*low = lo;
*high = hi;
md_bitmap_free(bitmap);
return rv;
}
-EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot);
+static void bitmap_set_pages(void *data, unsigned long pages)
+{
+ struct bitmap *bitmap = data;
+ /* @data is dereferenced without a NULL check; overwrites counts.pages directly */
+ bitmap->counts.pages = pages;
+}
-void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
+static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
- unsigned long chunk_kb;
+ struct bitmap_storage *storage;
struct bitmap_counts *counts;
+ struct bitmap *bitmap = data;
+ bitmap_super_t *sb;
if (!bitmap)
- return;
+ return -ENOENT;
+ if (bitmap->mddev->bitmap_info.external)
+ return -ENOENT;
+ if (!bitmap->storage.sb_page) /* no superblock */
+ return -EINVAL;
+ sb = kmap_local_page(bitmap->storage.sb_page);
+ stats->sync_size = le64_to_cpu(sb->sync_size);
+ kunmap_local(sb);
counts = &bitmap->counts;
+ stats->missing_pages = counts->missing_pages;
+ stats->pages = counts->pages;
- chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
- seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
- "%lu%s chunk",
- counts->pages - counts->missing_pages,
- counts->pages,
- (counts->pages - counts->missing_pages)
- << (PAGE_SHIFT - 10),
- chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
- chunk_kb ? "KB" : "B");
- if (bitmap->storage.file) {
- seq_printf(seq, ", file: ");
- seq_file_path(seq, bitmap->storage.file, " \t\n");
- }
+ storage = &bitmap->storage;
+ stats->file_pages = storage->file_pages;
+ stats->file = storage->file;
- seq_printf(seq, "\n");
+ stats->behind_writes = atomic_read(&bitmap->behind_writes);
+ stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait);
+ stats->events_cleared = bitmap->events_cleared;
+ return 0;
}
-int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
- int chunksize, int init)
+static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+ int chunksize, bool init)
{
/* If chunk_size is 0, choose an appropriate chunk size.
* Then possibly allocate new storage space.
@@ -2320,14 +2576,24 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
spin_unlock_irq(&bitmap->counts.lock);
if (!init) {
- md_bitmap_unplug(bitmap);
+ __bitmap_unplug(bitmap);
bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
}
ret = 0;
err:
return ret;
}
-EXPORT_SYMBOL_GPL(md_bitmap_resize);
+
+static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
+ bool init)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return 0; /* no bitmap attached: reported as success */
+ /* mddev-level wrapper: the resize itself is done by __bitmap_resize() */
+ return __bitmap_resize(bitmap, blocks, chunksize, init);
+}
static ssize_t
location_show(struct mddev *mddev, char *page)
@@ -2367,7 +2633,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
goto out;
}
- md_bitmap_destroy(mddev);
+ bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
@@ -2377,7 +2643,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else {
/* No bitmap, OK to set a location */
long long offset;
- struct bitmap *bitmap;
if (strncmp(buf, "none", 4) == 0)
/* nothing to be done */;
@@ -2404,17 +2669,14 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
mddev->bitmap_info.offset = offset;
- bitmap = md_bitmap_create(mddev, -1);
- if (IS_ERR(bitmap)) {
- rv = PTR_ERR(bitmap);
+ rv = bitmap_create(mddev, -1);
+ if (rv)
goto out;
- }
- mddev->bitmap = bitmap;
- rv = md_bitmap_load(mddev);
+ rv = bitmap_load(mddev);
if (rv) {
mddev->bitmap_info.offset = 0;
- md_bitmap_destroy(mddev);
+ bitmap_destroy(mddev);
goto out;
}
}
@@ -2450,6 +2712,7 @@ space_show(struct mddev *mddev, char *page)
static ssize_t
space_store(struct mddev *mddev, const char *buf, size_t len)
{
+ struct bitmap *bitmap;
unsigned long sectors;
int rv;
@@ -2460,8 +2723,8 @@ space_store(struct mddev *mddev, const char *buf, size_t len)
if (sectors == 0)
return -EINVAL;
- if (mddev->bitmap &&
- sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
+ bitmap = mddev->bitmap;
+ if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9)
return -EFBIG; /* Bitmap is too big for this small space */
/* could make sure it isn't too big, but that isn't really
@@ -2569,7 +2832,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev_create_serial_pool(mddev, rdev);
}
if (old_mwb != backlog)
- md_bitmap_update_sb(mddev->bitmap);
+ bitmap_update_sb(mddev->bitmap);
mddev_unlock_and_resume(mddev);
return len;
@@ -2638,10 +2901,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t can_clear_show(struct mddev *mddev, char *page)
{
int len;
+ struct bitmap *bitmap;
+
spin_lock(&mddev->lock);
- if (mddev->bitmap)
- len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
- "false" : "true"));
+ bitmap = mddev->bitmap;
+ if (bitmap)
+ len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" :
+ "true"));
else
len = sprintf(page, "\n");
spin_unlock(&mddev->lock);
@@ -2650,17 +2916,24 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page)
static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
{
- if (mddev->bitmap == NULL)
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
return -ENOENT;
- if (strncmp(buf, "false", 5) == 0)
- mddev->bitmap->need_sync = 1;
- else if (strncmp(buf, "true", 4) == 0) {
+
+ if (strncmp(buf, "false", 5) == 0) {
+ bitmap->need_sync = 1;
+ return len;
+ }
+
+ if (strncmp(buf, "true", 4) == 0) {
if (mddev->degraded)
return -EBUSY;
- mddev->bitmap->need_sync = 0;
- } else
- return -EINVAL;
- return len;
+ bitmap->need_sync = 0;
+ return len;
+ }
+
+ return -EINVAL;
}
static struct md_sysfs_entry bitmap_can_clear =
@@ -2670,21 +2943,26 @@ static ssize_t
behind_writes_used_show(struct mddev *mddev, char *page)
{
ssize_t ret;
+ struct bitmap *bitmap;
+
spin_lock(&mddev->lock);
- if (mddev->bitmap == NULL)
+ bitmap = mddev->bitmap;
+ if (!bitmap)
ret = sprintf(page, "0\n");
else
- ret = sprintf(page, "%lu\n",
- mddev->bitmap->behind_writes_used);
+ ret = sprintf(page, "%lu\n", bitmap->behind_writes_used);
spin_unlock(&mddev->lock);
+
return ret;
}
static ssize_t
behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
{
- if (mddev->bitmap)
- mddev->bitmap->behind_writes_used = 0;
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (bitmap)
+ bitmap->behind_writes_used = 0;
return len;
}
@@ -2707,3 +2985,41 @@ const struct attribute_group md_bitmap_group = {
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
+
+static struct bitmap_operations bitmap_ops = {
+ .enabled = bitmap_enabled,
+ .create = bitmap_create,
+ .resize = bitmap_resize,
+ .load = bitmap_load,
+ .destroy = bitmap_destroy,
+ .flush = bitmap_flush,
+ .write_all = bitmap_write_all,
+ .dirty_bits = bitmap_dirty_bits,
+ .unplug = bitmap_unplug,
+ .daemon_work = bitmap_daemon_work,
+
+ .start_behind_write = bitmap_start_behind_write,
+ .end_behind_write = bitmap_end_behind_write,
+ .wait_behind_writes = bitmap_wait_behind_writes,
+
+ .startwrite = bitmap_startwrite,
+ .endwrite = bitmap_endwrite,
+ .start_sync = bitmap_start_sync,
+ .end_sync = bitmap_end_sync,
+ .cond_end_sync = bitmap_cond_end_sync,
+ .close_sync = bitmap_close_sync,
+
+ .update_sb = bitmap_update_sb,
+ .get_stats = bitmap_get_stats,
+
+ .sync_with_cluster = bitmap_sync_with_cluster,
+ .get_from_slot = bitmap_get_from_slot,
+ .copy_from_slot = bitmap_copy_from_slot,
+ .set_pages = bitmap_set_pages,
+ .free = md_bitmap_free,
+};
+
+void mddev_set_bitmap_ops(struct mddev *mddev)
+{
+ mddev->bitmap_ops = &bitmap_ops;
+}
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index bb9eb418780a..31c93019c76b 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -7,81 +7,7 @@
#ifndef BITMAP_H
#define BITMAP_H 1
-#define BITMAP_MAJOR_LO 3
-/* version 4 insists the bitmap is in little-endian order
- * with version 3, it is host-endian which is non-portable
- * Version 5 is currently set only for clustered devices
- */
-#define BITMAP_MAJOR_HI 4
-#define BITMAP_MAJOR_CLUSTERED 5
-#define BITMAP_MAJOR_HOSTENDIAN 3
-
-/*
- * in-memory bitmap:
- *
- * Use 16 bit block counters to track pending writes to each "chunk".
- * The 2 high order bits are special-purpose, the first is a flag indicating
- * whether a resync is needed. The second is a flag indicating whether a
- * resync is active.
- * This means that the counter is actually 14 bits:
- *
- * +--------+--------+------------------------------------------------+
- * | resync | resync | counter |
- * | needed | active | |
- * | (0-1) | (0-1) | (0-16383) |
- * +--------+--------+------------------------------------------------+
- *
- * The "resync needed" bit is set when:
- * a '1' bit is read from storage at startup.
- * a write request fails on some drives
- * a resync is aborted on a chunk with 'resync active' set
- * It is cleared (and resync-active set) when a resync starts across all drives
- * of the chunk.
- *
- *
- * The "resync active" bit is set when:
- * a resync is started on all drives, and resync_needed is set.
- * resync_needed will be cleared (as long as resync_active wasn't already set).
- * It is cleared when a resync completes.
- *
- * The counter counts pending write requests, plus the on-disk bit.
- * When the counter is '1' and the resync bits are clear, the on-disk
- * bit can be cleared as well, thus setting the counter to 0.
- * When we set a bit, or in the counter (to start a write), if the fields is
- * 0, we first set the disk bit and set the counter to 1.
- *
- * If the counter is 0, the on-disk bit is clear and the stripe is clean
- * Anything that dirties the stripe pushes the counter to 2 (at least)
- * and sets the on-disk bit (lazily).
- * If a periodic sweep find the counter at 2, it is decremented to 1.
- * If the sweep find the counter at 1, the on-disk bit is cleared and the
- * counter goes to zero.
- *
- * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
- * counters as a fallback when "page" memory cannot be allocated:
- *
- * Normal case (page memory allocated):
- *
- * page pointer (32-bit)
- *
- * [ ] ------+
- * |
- * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
- * c1 c2 c2048
- *
- * Hijacked case (page memory allocation failed):
- *
- * hijacked page pointer (32-bit)
- *
- * [ ][ ] (no page memory allocated)
- * counter #1 (16-bit) counter #2 (16-bit)
- *
- */
-
-#ifdef __KERNEL__
-
-#define PAGE_BITS (PAGE_SIZE << 3)
-#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+#define BITMAP_MAGIC 0x6d746962
typedef __u16 bitmap_counter_t;
#define COUNTER_BITS 16
@@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t;
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
-#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
-#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
-#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
-
-/* how many counters per page? */
-#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
-/* same, except a shift value for more efficient bitops */
-#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
-/* same, except a mask value for more efficient bitops */
-#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
-
-#define BITMAP_BLOCK_SHIFT 9
-
-#endif
-
-/*
- * bitmap structures:
- */
-
-#define BITMAP_MAGIC 0x6d746962
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
@@ -152,136 +58,61 @@ typedef struct bitmap_super_s {
* devices. For raid10 it is the size of the array.
*/
-#ifdef __KERNEL__
+struct md_bitmap_stats {
+ u64 events_cleared;
+ int behind_writes;
+ bool behind_wait;
-/* the in-memory bitmap is represented by bitmap_pages */
-struct bitmap_page {
- /*
- * map points to the actual memory page
- */
- char *map;
- /*
- * in emergencies (when map cannot be alloced), hijack the map
- * pointer and use it as two counters itself
- */
- unsigned int hijacked:1;
- /*
- * If any counter in this page is '1' or '2' - and so could be
- * cleared then that page is marked as 'pending'
- */
- unsigned int pending:1;
- /*
- * count of dirty bits on the page
- */
- unsigned int count:30;
+ unsigned long missing_pages;
+ unsigned long file_pages;
+ unsigned long sync_size;
+ unsigned long pages;
+ struct file *file;
};
-/* the main bitmap structure - one per mddev */
-struct bitmap {
-
- struct bitmap_counts {
- spinlock_t lock;
- struct bitmap_page *bp;
- unsigned long pages; /* total number of pages
- * in the bitmap */
- unsigned long missing_pages; /* number of pages
- * not yet allocated */
- unsigned long chunkshift; /* chunksize = 2^chunkshift
- * (for bitops) */
- unsigned long chunks; /* Total number of data
- * chunks for the array */
- } counts;
-
- struct mddev *mddev; /* the md device that the bitmap is for */
-
- __u64 events_cleared;
- int need_sync;
-
- struct bitmap_storage {
- struct file *file; /* backing disk file */
- struct page *sb_page; /* cached copy of the bitmap
- * file superblock */
- unsigned long sb_index;
- struct page **filemap; /* list of cache pages for
- * the file */
- unsigned long *filemap_attr; /* attributes associated
- * w/ filemap pages */
- unsigned long file_pages; /* number of pages in the file*/
- unsigned long bytes; /* total bytes in the bitmap */
- } storage;
-
- unsigned long flags;
-
- int allclean;
-
- atomic_t behind_writes;
- unsigned long behind_writes_used; /* highest actual value at runtime */
-
- /*
- * the bitmap daemon - periodically wakes up and sweeps the bitmap
- * file, cleaning up bits and flushing out pages to disk as necessary
- */
- unsigned long daemon_lastrun; /* jiffies of last run */
- unsigned long last_end_sync; /* when we lasted called end_sync to
- * update bitmap with resync progress */
-
- atomic_t pending_writes; /* pending writes to the bitmap file */
- wait_queue_head_t write_wait;
- wait_queue_head_t overflow_wait;
- wait_queue_head_t behind_wait;
-
- struct kernfs_node *sysfs_can_clear;
- int cluster_slot; /* Slot offset for clustered env */
+struct bitmap_operations {
+ bool (*enabled)(struct mddev *mddev);
+ int (*create)(struct mddev *mddev, int slot);
+ int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
+ bool init);
+
+ int (*load)(struct mddev *mddev);
+ void (*destroy)(struct mddev *mddev);
+ void (*flush)(struct mddev *mddev);
+ void (*write_all)(struct mddev *mddev);
+ void (*dirty_bits)(struct mddev *mddev, unsigned long s,
+ unsigned long e);
+ void (*unplug)(struct mddev *mddev, bool sync);
+ void (*daemon_work)(struct mddev *mddev);
+
+ void (*start_behind_write)(struct mddev *mddev);
+ void (*end_behind_write)(struct mddev *mddev);
+ void (*wait_behind_writes)(struct mddev *mddev);
+
+ int (*startwrite)(struct mddev *mddev, sector_t offset,
+ unsigned long sectors);
+ void (*endwrite)(struct mddev *mddev, sector_t offset,
+ unsigned long sectors);
+ bool (*start_sync)(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded);
+ void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
+ void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force);
+ void (*close_sync)(struct mddev *mddev);
+
+ void (*update_sb)(void *data);
+ int (*get_stats)(void *data, struct md_bitmap_stats *stats);
+
+ void (*sync_with_cluster)(struct mddev *mddev,
+ sector_t old_lo, sector_t old_hi,
+ sector_t new_lo, sector_t new_hi);
+ void *(*get_from_slot)(struct mddev *mddev, int slot);
+ int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo,
+ sector_t *hi, bool clear_bits);
+ void (*set_pages)(void *data, unsigned long pages);
+ void (*free)(void *data);
};
/* the bitmap API */
-
-/* these are used only by md/bitmap */
-struct bitmap *md_bitmap_create(struct mddev *mddev, int slot);
-int md_bitmap_load(struct mddev *mddev);
-void md_bitmap_flush(struct mddev *mddev);
-void md_bitmap_destroy(struct mddev *mddev);
-
-void md_bitmap_print_sb(struct bitmap *bitmap);
-void md_bitmap_update_sb(struct bitmap *bitmap);
-void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
-
-int md_bitmap_setallbits(struct bitmap *bitmap);
-void md_bitmap_write_all(struct bitmap *bitmap);
-
-void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
-
-/* these are exported */
-int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
- unsigned long sectors, int behind);
-void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
- unsigned long sectors, int success, int behind);
-int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
-void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
-void md_bitmap_close_sync(struct bitmap *bitmap);
-void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
-void md_bitmap_sync_with_cluster(struct mddev *mddev,
- sector_t old_lo, sector_t old_hi,
- sector_t new_lo, sector_t new_hi);
-
-void md_bitmap_unplug(struct bitmap *bitmap);
-void md_bitmap_unplug_async(struct bitmap *bitmap);
-void md_bitmap_daemon_work(struct mddev *mddev);
-
-int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
- int chunksize, int init);
-struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
-int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
- sector_t *lo, sector_t *hi, bool clear_bits);
-void md_bitmap_free(struct bitmap *bitmap);
-void md_bitmap_wait_behind_writes(struct mddev *mddev);
-
-static inline bool md_bitmap_enabled(struct bitmap *bitmap)
-{
- return bitmap && bitmap->storage.filemap &&
- !test_bit(BITMAP_STALE, &bitmap->flags);
-}
-
-#endif
+void mddev_set_bitmap_ops(struct mddev *mddev);
#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 8e36a0feec09..6595f89becdb 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -15,6 +15,7 @@
#define LVB_SIZE 64
#define NEW_DEV_TIMEOUT 5000
+#define WAIT_DLM_LOCK_TIMEOUT (30 * HZ)
struct dlm_lock_resource {
dlm_lockspace_t *ls;
@@ -56,6 +57,7 @@ struct resync_info {
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8
+#define MD_CLUSTER_WAITING_FOR_SYNC 9
struct md_cluster_info {
struct mddev *mddev; /* the md device which md_cluster_info belongs to */
@@ -91,6 +93,7 @@ struct md_cluster_info {
sector_t sync_hi;
};
+/* For compatibility, add the new msg_type at the end. */
enum msg_type {
METADATA_UPDATED = 0,
RESYNCING,
@@ -100,6 +103,7 @@ enum msg_type {
BITMAP_NEEDS_SYNC,
CHANGE_CAPACITY,
BITMAP_RESIZE,
+ RESYNCING_START,
};
struct cluster_msg {
@@ -130,8 +134,13 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
0, sync_ast, res, res->bast);
if (ret)
return ret;
- wait_event(res->sync_locking, res->sync_locking_done);
+ ret = wait_event_timeout(res->sync_locking, res->sync_locking_done,
+ WAIT_DLM_LOCK_TIMEOUT);
res->sync_locking_done = false;
+ if (!ret) {
+ pr_err("locking DLM '%s' timeout!\n", res->name);
+ return -EBUSY;
+ }
if (res->lksb.sb_status == 0)
res->mode = mode;
return res->lksb.sb_status;
@@ -308,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread)
str, ret);
goto clear_bit;
}
- ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
+ ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true);
if (ret) {
pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
goto clear_bit;
@@ -455,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev,
clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
remove_suspend_info(mddev, slot);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
md_wakeup_thread(mddev->thread);
return;
}
@@ -487,8 +497,8 @@ static void process_suspend_info(struct mddev *mddev,
* we don't want to trigger lots of WARN.
*/
if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
- md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
- cinfo->sync_hi, lo, hi);
+ mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low,
+ cinfo->sync_hi, lo, hi);
cinfo->sync_low = lo;
cinfo->sync_hi = hi;
@@ -525,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
res = -1;
}
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+ set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
return res;
}
@@ -593,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
case CHANGE_CAPACITY:
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
break;
+ case RESYNCING_START:
+ clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state);
+ break;
case RESYNCING:
set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
process_suspend_info(mddev, le32_to_cpu(msg->slot),
@@ -614,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
break;
case BITMAP_RESIZE:
if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
- ret = md_bitmap_resize(mddev->bitmap,
- le64_to_cpu(msg->high), 0, 0);
+ ret = mddev->bitmap_ops->resize(mddev,
+ le64_to_cpu(msg->high),
+ 0, false);
break;
default:
ret = -1;
@@ -743,7 +758,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
*/
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
- int error;
+ int error, unlock_error;
int slot = cinfo->slot_number - 1;
cmsg->slot = cpu_to_le32(slot);
@@ -751,7 +766,7 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
if (error) {
pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
- goto failed_message;
+ return error;
}
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
@@ -781,14 +796,10 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
- error = dlm_unlock_sync(cinfo->message_lockres);
- if (unlikely(error != 0)) {
+ while ((unlock_error = dlm_unlock_sync(cinfo->message_lockres)))
pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
- error);
- /* in case the message can't be released due to some reason */
- goto failed_ack;
- }
-failed_message:
+ unlock_error);
+
return error;
}
@@ -846,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
}
/* Read the disk bitmap sb and check if it needs recovery */
- ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
+ ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false);
if (ret) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
lockres_free(bm_lockres);
@@ -887,7 +898,7 @@ static int join(struct mddev *mddev, int nodes)
memset(str, 0, 64);
sprintf(str, "%pU", mddev->uuid);
ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
- 0, LVB_SIZE, &md_ls_ops, mddev,
+ DLM_LSFL_SOFTIRQ, LVB_SIZE, &md_ls_ops, mddev,
&ops_rv, &cinfo->lockspace);
if (ret)
goto err;
@@ -1133,13 +1144,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size)
static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
{
- struct bitmap_counts *counts;
- char str[64];
- struct dlm_lock_resource *bm_lockres;
- struct bitmap *bitmap = mddev->bitmap;
- unsigned long my_pages = bitmap->counts.pages;
+ void *bitmap = mddev->bitmap;
+ struct md_bitmap_stats stats;
+ unsigned long my_pages;
int i, rv;
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv)
+ return rv;
+
+ my_pages = stats.pages;
/*
* We need to ensure all the nodes can grow to a larger
* bitmap size before make the reshaping.
@@ -1149,17 +1163,22 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
return rv;
for (i = 0; i < mddev->bitmap_info.nodes; i++) {
+ struct dlm_lock_resource *bm_lockres;
+ char str[64];
+
if (i == md_cluster_ops->slot_number(mddev))
continue;
- bitmap = get_bitmap_from_slot(mddev, i);
+ bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
bitmap = NULL;
goto out;
}
- counts = &bitmap->counts;
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv)
+ goto out;
/*
* If we can hold the bitmap lock of one node then
* the slot is not occupied, update the pages.
@@ -1173,21 +1192,21 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
- counts->pages = my_pages;
+ mddev->bitmap_ops->set_pages(bitmap, my_pages);
lockres_free(bm_lockres);
- if (my_pages != counts->pages)
+ if (my_pages != stats.pages)
/*
* Let's revert the bitmap size if one node
* can't resize bitmap
*/
goto out;
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
}
return 0;
out:
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
update_bitmap_size(mddev, oldsize);
return -1;
}
@@ -1197,24 +1216,27 @@ out:
*/
static int cluster_check_sync_size(struct mddev *mddev)
{
- int i, rv;
- bitmap_super_t *sb;
- unsigned long my_sync_size, sync_size = 0;
- int node_num = mddev->bitmap_info.nodes;
int current_slot = md_cluster_ops->slot_number(mddev);
- struct bitmap *bitmap = mddev->bitmap;
- char str[64];
+ int node_num = mddev->bitmap_info.nodes;
struct dlm_lock_resource *bm_lockres;
+ struct md_bitmap_stats stats;
+ void *bitmap = mddev->bitmap;
+ unsigned long sync_size = 0;
+ unsigned long my_sync_size;
+ char str[64];
+ int i, rv;
- sb = kmap_atomic(bitmap->storage.sb_page);
- my_sync_size = sb->sync_size;
- kunmap_atomic(sb);
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv)
+ return rv;
+
+ my_sync_size = stats.sync_size;
for (i = 0; i < node_num; i++) {
if (i == current_slot)
continue;
- bitmap = get_bitmap_from_slot(mddev, i);
+ bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
return -1;
@@ -1228,25 +1250,28 @@ static int cluster_check_sync_size(struct mddev *mddev)
bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
pr_err("md-cluster: Cannot initialize %s\n", str);
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
return -1;
}
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
- md_bitmap_update_sb(bitmap);
+ mddev->bitmap_ops->update_sb(bitmap);
lockres_free(bm_lockres);
- sb = kmap_atomic(bitmap->storage.sb_page);
- if (sync_size == 0)
- sync_size = sb->sync_size;
- else if (sync_size != sb->sync_size) {
- kunmap_atomic(sb);
- md_bitmap_free(bitmap);
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv) {
+ mddev->bitmap_ops->free(bitmap);
+ return rv;
+ }
+
+ if (sync_size == 0) {
+ sync_size = stats.sync_size;
+ } else if (sync_size != stats.sync_size) {
+ mddev->bitmap_ops->free(bitmap);
return -1;
}
- kunmap_atomic(sb);
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
}
return (my_sync_size == sync_size) ? 0 : -1;
@@ -1343,6 +1368,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
spin_unlock_irq(&cinfo->suspend_lock);
}
+static int resync_status_get(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
+}
+
+static int resync_start_notify(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg = {0};
+
+ cmsg.type = cpu_to_le32(RESYNCING_START);
+
+ return sendmsg(cinfo, &cmsg, 0);
+}
+
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1558,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
if (sn == (cinfo->slot_number - 1))
continue;
- err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
+ err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false);
if (err) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
goto out;
@@ -1570,13 +1612,15 @@ out:
return err;
}
-static struct md_cluster_operations cluster_ops = {
+static const struct md_cluster_operations cluster_ops = {
.join = join,
.leave = leave,
.slot_number = slot_number,
.resync_start = resync_start,
.resync_finish = resync_finish,
.resync_info_update = resync_info_update,
+ .resync_start_notify = resync_start_notify,
+ .resync_status_get = resync_status_get,
.resync_info_get = resync_info_get,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index a78e3021775d..470bf18ffde5 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -14,6 +14,8 @@ struct md_cluster_operations {
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+ int (*resync_start_notify)(struct mddev *mddev);
+ int (*resync_status_get)(struct mddev *mddev);
void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
new file mode 100644
index 000000000000..369aed044b40
--- /dev/null
+++ b/drivers/md/md-linear.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc
+ * ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/block.h>
+#include "md.h"
+
+struct dev_info {
+ struct md_rdev *rdev;
+ sector_t end_sector;
+};
+
+struct linear_conf {
+ struct rcu_head rcu;
+ sector_t array_sectors;
+ /* a copy of mddev->raid_disks */
+ int raid_disks;
+ struct dev_info disks[] __counted_by(raid_disks);
+};
+
+/*
+ * find which device holds a particular offset
+ */
+static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
+{
+ int lo, mid, hi;
+ struct linear_conf *conf;
+
+ lo = 0;
+ hi = mddev->raid_disks - 1;
+ conf = mddev->private;
+
+ /*
+ * Binary Search
+ */
+
+ while (hi > lo) {
+
+ mid = (hi + lo) / 2;
+ if (sector < conf->disks[mid].end_sector)
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+
+ return conf->disks + lo;
+}
+
+static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ struct linear_conf *conf;
+ sector_t array_sectors;
+
+ conf = mddev->private;
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+ array_sectors = conf->array_sectors;
+
+ return array_sectors;
+}
+
+static int linear_set_limits(struct mddev *mddev)
+{
+ struct queue_limits lim;
+ int err;
+
+ md_init_stacking_limits(&lim);
+ lim.max_hw_sectors = mddev->chunk_sectors;
+ lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+ lim.io_min = mddev->chunk_sectors << 9;
+ err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
+ if (err)
+ return err;
+
+ return queue_limits_set(mddev->gendisk->queue, &lim);
+}
+
+static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
+{
+ struct linear_conf *conf;
+ struct md_rdev *rdev;
+ int ret = -EINVAL;
+ int cnt;
+ int i;
+
+ conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
+ if (!conf)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * conf->raid_disks is a copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with the number of pointers
+ * in conf->disks[] when it is updated in linear_add() and used to
+ * iterate the old conf->disks[] array in linear_congested().
+ * Here conf->raid_disks is always consistent with the number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_add(), so such a race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
+ cnt = 0;
+ conf->array_sectors = 0;
+
+ rdev_for_each(rdev, mddev) {
+ int j = rdev->raid_disk;
+ struct dev_info *disk = conf->disks + j;
+ sector_t sectors;
+
+ if (j < 0 || j >= raid_disks || disk->rdev) {
+ pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ disk->rdev = rdev;
+ if (mddev->chunk_sectors) {
+ sectors = rdev->sectors;
+ sector_div(sectors, mddev->chunk_sectors);
+ rdev->sectors = sectors * mddev->chunk_sectors;
+ }
+
+ conf->array_sectors += rdev->sectors;
+ cnt++;
+ }
+ if (cnt != raid_disks) {
+ pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ /*
+ * Here we calculate the device offsets.
+ */
+ conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
+
+ for (i = 1; i < raid_disks; i++)
+ conf->disks[i].end_sector =
+ conf->disks[i-1].end_sector +
+ conf->disks[i].rdev->sectors;
+
+ if (!mddev_is_dm(mddev)) {
+ ret = linear_set_limits(mddev);
+ if (ret)
+ goto out;
+ }
+
+ return conf;
+
+out:
+ kfree(conf);
+ return ERR_PTR(ret);
+}
+
+static int linear_run(struct mddev *mddev)
+{
+ struct linear_conf *conf;
+ int ret;
+
+ if (md_check_no_bitmap(mddev))
+ return -EINVAL;
+
+ conf = linear_conf(mddev, mddev->raid_disks);
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ mddev->private = conf;
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+
+ ret = md_integrity_register(mddev);
+ if (ret) {
+ kfree(conf);
+ mddev->private = NULL;
+ }
+ return ret;
+}
+
+static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
+{
+ /* Adding a drive to a linear array allows the array to grow.
+ * It is permitted if the new drive has a matching superblock
+ * already on it, with raid_disk equal to raid_disks.
+ * It is achieved by creating a new struct linear_conf
+ * and swapping it in in place of the current one.
+ * The old one is released with kfree_rcu() after the swap.
+ * This avoids races.
+ */
+ struct linear_conf *newconf, *oldconf;
+
+ if (rdev->saved_raid_disk != mddev->raid_disks)
+ return -EINVAL;
+
+ rdev->raid_disk = rdev->saved_raid_disk;
+ rdev->saved_raid_disk = -1;
+
+ newconf = linear_conf(mddev, mddev->raid_disks + 1);
+ if (IS_ERR(newconf))
+ return PTR_ERR(newconf);
+
+ /* newconf->raid_disks already keeps a copy of the increased
+ * value of mddev->raid_disks, WARN_ONCE() is just used to make
+ * sure of this. It is possible that oldconf is still referenced
+ * in linear_congested(), therefore kfree_rcu() is used to defer
+ * freeing oldconf until no one uses it anymore.
+ */
+ oldconf = rcu_dereference_protected(mddev->private,
+ lockdep_is_held(&mddev->reconfig_mutex));
+ mddev->raid_disks++;
+ WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+ "copied raid_disks doesn't match mddev->raid_disks");
+ rcu_assign_pointer(mddev->private, newconf);
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
+ kfree_rcu(oldconf, rcu);
+ return 0;
+}
+
+static void linear_free(struct mddev *mddev, void *priv)
+{
+ struct linear_conf *conf = priv;
+
+ kfree(conf);
+}
+
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct dev_info *tmp_dev;
+ sector_t start_sector, end_sector, data_offset;
+ sector_t bio_sector = bio->bi_iter.bi_sector;
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
+ return true;
+
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
+ md_error(mddev, tmp_dev->rdev);
+ bio_io_error(bio);
+ return true;
+ }
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to split it */
+ struct bio *split = bio_split(bio, end_sector - bio_sector,
+ GFP_NOIO, &mddev->bio_set);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return true;
+ }
+
+ bio_chain(split, bio);
+ submit_bio_noacct(bio);
+ bio = split;
+ }
+
+ md_account_bio(mddev, &bio);
+ bio_set_dev(bio, tmp_dev->rdev->bdev);
+ bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !bdev_max_discard_sectors(bio->bi_bdev))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_write_zeroes(mddev, bio);
+ submit_bio_noacct(bio);
+ }
+ return true;
+
+out_of_bounds:
+ pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
+ mdname(mddev),
+ (unsigned long long)bio->bi_iter.bi_sector,
+ tmp_dev->rdev->bdev,
+ (unsigned long long)tmp_dev->rdev->sectors,
+ (unsigned long long)start_sector);
+ bio_io_error(bio);
+ return true;
+}
+
+static void linear_status(struct seq_file *seq, struct mddev *mddev)
+{
+ seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+}
+
+static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+ char *md_name = mdname(mddev);
+
+ pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
+ md_name, rdev->bdev);
+ }
+}
+
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
+static struct md_personality linear_personality = {
+ .name = "linear",
+ .level = LEVEL_LINEAR,
+ .owner = THIS_MODULE,
+ .make_request = linear_make_request,
+ .run = linear_run,
+ .free = linear_free,
+ .status = linear_status,
+ .hot_add_disk = linear_add,
+ .size = linear_size,
+ .quiesce = linear_quiesce,
+ .error_handler = linear_error,
+};
+
+static int __init linear_init(void)
+{
+ return register_md_personality(&linear_personality);
+}
+
+static void linear_exit(void)
+{
+ unregister_md_personality(&linear_personality);
+}
+
+module_init(linear_init);
+module_exit(linear_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
+MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e575e74aabf5..30b3dbbce2d2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -69,13 +69,23 @@
#include "md-bitmap.h"
#include "md-cluster.h"
+static const char *action_name[NR_SYNC_ACTIONS] = {
+ [ACTION_RESYNC] = "resync",
+ [ACTION_RECOVER] = "recover",
+ [ACTION_CHECK] = "check",
+ [ACTION_REPAIR] = "repair",
+ [ACTION_RESHAPE] = "reshape",
+ [ACTION_FROZEN] = "frozen",
+ [ACTION_IDLE] = "idle",
+};
+
/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
static const struct kobj_type md_ktype;
-struct md_cluster_operations *md_cluster_ops;
+const struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;
@@ -284,7 +294,7 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
static struct ctl_table_header *raid_table_header;
-static struct ctl_table raid_table[] = {
+static const struct ctl_table raid_table[] = {
{
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
@@ -479,7 +489,6 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
*/
WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
- del_timer_sync(&mddev->safemode_timer);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
@@ -538,140 +547,57 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
}
/*
- * Generic flush handling for md
+ * The only difference from bio_chain_endio() is that the current
+ * bi_status of bio does not affect the bi_status of parent.
*/
-
static void md_end_flush(struct bio *bio)
{
- struct md_rdev *rdev = bio->bi_private;
- struct mddev *mddev = rdev->mddev;
+ struct bio *parent = bio->bi_private;
- bio_put(bio);
-
- rdev_dec_pending(rdev, mddev);
-
- if (atomic_dec_and_test(&mddev->flush_pending)) {
- /* The pair is percpu_ref_get() from md_flush_request() */
- percpu_ref_put(&mddev->active_io);
+ /*
+ * If any flush I/O fails before a power failure,
+ * disk data may be lost.
+ */
+ if (bio->bi_status)
+ pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
+ blk_status_to_errno(bio->bi_status));
- /* The pre-request flush has finished */
- queue_work(md_wq, &mddev->flush_work);
- }
+ bio_put(bio);
+ bio_endio(parent);
}
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
+bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
- struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
-
- mddev->start_flush = ktime_get_boottime();
- INIT_WORK(&mddev->flush_work, md_submit_flush_data);
- atomic_set(&mddev->flush_pending, 1);
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Faulty, &rdev->flags)) {
- struct bio *bi;
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- bi = bio_alloc_bioset(rdev->bdev, 0,
- REQ_OP_WRITE | REQ_PREFLUSH,
- GFP_NOIO, &mddev->bio_set);
- bi->bi_end_io = md_end_flush;
- bi->bi_private = rdev;
- atomic_inc(&mddev->flush_pending);
- submit_bio(bi);
- rcu_read_lock();
- }
- rcu_read_unlock();
- if (atomic_dec_and_test(&mddev->flush_pending)) {
- /* The pair is percpu_ref_get() from md_flush_request() */
- percpu_ref_put(&mddev->active_io);
-
- queue_work(md_wq, &mddev->flush_work);
- }
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
- struct mddev *mddev = container_of(ws, struct mddev, flush_work);
- struct bio *bio = mddev->flush_bio;
+ struct bio *new;
/*
- * must reset flush_bio before calling into md_handle_request to avoid a
- * deadlock, because other bios passed md_handle_request suspend check
- * could wait for this and below md_handle_request could wait for those
- * bios because of suspend check
+ * md_flush_request() should be called under md_handle_request() and
+ * 'active_io' is already grabbed. Hence it's safe to get rdev directly
+ * without rcu protection.
*/
- spin_lock_irq(&mddev->lock);
- mddev->prev_flush_start = mddev->start_flush;
- mddev->flush_bio = NULL;
- spin_unlock_irq(&mddev->lock);
- wake_up(&mddev->sb_wait);
+ WARN_ON(percpu_ref_is_zero(&mddev->active_io));
- if (bio->bi_iter.bi_size == 0) {
- /* an empty barrier - all done */
- bio_endio(bio);
- } else {
- bio->bi_opf &= ~REQ_PREFLUSH;
- md_handle_request(mddev, bio);
- }
-}
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
-/*
- * Manages consolidation of flushes and submitting any flushes needed for
- * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
- * being finished in another context. Returns false if the flushing is
- * complete but still needs the I/O portion of the bio to be processed.
- */
-bool md_flush_request(struct mddev *mddev, struct bio *bio)
-{
- ktime_t req_start = ktime_get_boottime();
- spin_lock_irq(&mddev->lock);
- /* flush requests wait until ongoing flush completes,
- * hence coalescing all the pending requests.
- */
- wait_event_lock_irq(mddev->sb_wait,
- !mddev->flush_bio ||
- ktime_before(req_start, mddev->prev_flush_start),
- mddev->lock);
- /* new request after previous flush is completed */
- if (ktime_after(req_start, mddev->prev_flush_start)) {
- WARN_ON(mddev->flush_bio);
- /*
- * Grab a reference to make sure mddev_suspend() will wait for
- * this flush to be done.
- *
- * md_flush_reqeust() is called under md_handle_request() and
- * 'active_io' is already grabbed, hence percpu_ref_is_zero()
- * won't pass, percpu_ref_tryget_live() can't be used because
- * percpu_ref_kill() can be called by mddev_suspend()
- * concurrently.
- */
- WARN_ON(percpu_ref_is_zero(&mddev->active_io));
- percpu_ref_get(&mddev->active_io);
- mddev->flush_bio = bio;
- bio = NULL;
+ new = bio_alloc_bioset(rdev->bdev, 0,
+ REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
+ &mddev->bio_set);
+ new->bi_private = bio;
+ new->bi_end_io = md_end_flush;
+ bio_inc_remaining(bio);
+ submit_bio(new);
}
- spin_unlock_irq(&mddev->lock);
- if (!bio) {
- INIT_WORK(&mddev->flush_work, submit_flushes);
- queue_work(md_wq, &mddev->flush_work);
- } else {
- /* flush was performed for some other bio while we waited. */
- if (bio->bi_iter.bi_size == 0)
- /* an empty barrier - all done */
- bio_endio(bio);
- else {
- bio->bi_opf &= ~REQ_PREFLUSH;
- return false;
- }
+ if (bio_sectors(bio) == 0) {
+ bio_endio(bio);
+ return true;
}
- return true;
+
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ return false;
}
EXPORT_SYMBOL(md_flush_request);
@@ -742,7 +668,6 @@ int mddev_init(struct mddev *mddev)
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
- mutex_init(&mddev->sync_mutex);
mutex_init(&mddev->suspend_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
@@ -753,15 +678,15 @@ int mddev_init(struct mddev *mddev)
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock);
- atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
mddev->reshape_backwards = 0;
- mddev->last_sync_action = "none";
+ mddev->last_sync_action = ACTION_IDLE;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
+ mddev_set_bitmap_ops(mddev);
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
@@ -1362,6 +1287,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
return ret;
}
+static u64 md_bitmap_events_cleared(struct mddev *mddev)
+{
+ struct md_bitmap_stats stats;
+ int err;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (err)
+ return 0;
+
+ return stats.events_cleared;
+}
+
/*
* validate_super for 0.90.0
* note: we are not using "freshest" for 0.9 superblock
@@ -1454,7 +1391,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
/* if adding to array with a bitmap, then we can accept an
* older device ... but not too old.
*/
- if (ev1 < mddev->bitmap->events_cleared)
+ if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
@@ -1981,7 +1918,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
/* If adding to array with a bitmap, then we can accept an
* older device, but not too old.
*/
- if (ev1 < mddev->bitmap->events_cleared)
+ if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
@@ -2313,7 +2250,6 @@ super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
/* All necessary checks on new >= old have been done */
- struct bitmap *bitmap;
if (new_offset >= rdev->data_offset)
return 1;
@@ -2330,11 +2266,18 @@ super_1_allow_new_offset(struct md_rdev *rdev,
*/
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
- bitmap = rdev->mddev->bitmap;
- if (bitmap && !rdev->mddev->bitmap_info.file &&
- rdev->sb_start + rdev->mddev->bitmap_info.offset +
- bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
- return 0;
+
+ if (!rdev->mddev->bitmap_info.file) {
+ struct mddev *mddev = rdev->mddev;
+ struct md_bitmap_stats stats;
+ int err;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (!err && rdev->sb_start + mddev->bitmap_info.offset +
+ stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
+ return 0;
+ }
+
if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
return 0;
@@ -2410,36 +2353,10 @@ static LIST_HEAD(pending_raid_disks);
*/
int md_integrity_register(struct mddev *mddev)
{
- struct md_rdev *rdev, *reference = NULL;
-
if (list_empty(&mddev->disks))
return 0; /* nothing to do */
- if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk))
- return 0; /* shouldn't register, or already is */
- rdev_for_each(rdev, mddev) {
- /* skip spares and non-functional disks */
- if (test_bit(Faulty, &rdev->flags))
- continue;
- if (rdev->raid_disk < 0)
- continue;
- if (!reference) {
- /* Use the first rdev as the reference */
- reference = rdev;
- continue;
- }
- /* does this rdev's profile match the reference profile? */
- if (blk_integrity_compare(reference->bdev->bd_disk,
- rdev->bdev->bd_disk) < 0)
- return -EINVAL;
- }
- if (!reference || !bdev_get_integrity(reference->bdev))
- return 0;
- /*
- * All component devices are integrity capable and have matching
- * profiles, register the common profile for the md device.
- */
- blk_integrity_register(mddev->gendisk,
- bdev_get_integrity(reference->bdev));
+ if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk))
+ return 0; /* shouldn't register */
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
@@ -2459,32 +2376,6 @@ int md_integrity_register(struct mddev *mddev)
}
EXPORT_SYMBOL(md_integrity_register);
-/*
- * Attempt to add an rdev, but only if it is consistent with the current
- * integrity profile
- */
-int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
-{
- struct blk_integrity *bi_mddev;
-
- if (mddev_is_dm(mddev))
- return 0;
-
- bi_mddev = blk_get_integrity(mddev->gendisk);
-
- if (!bi_mddev) /* nothing to do */
- return 0;
-
- if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
- pr_err("%s: incompatible integrity profile for %pg\n",
- mdname(mddev), rdev->bdev);
- return -ENXIO;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(md_integrity_add_rdev);
-
static bool rdev_read_only(struct md_rdev *rdev)
{
return bdev_read_only(rdev->bdev) ||
@@ -2862,7 +2753,7 @@ repeat:
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
- md_bitmap_update_sb(mddev->bitmap);
+ mddev->bitmap_ops->update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
@@ -4184,6 +4075,34 @@ static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
static ssize_t
+new_level_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->new_level);
+}
+
+static ssize_t
+new_level_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int n;
+ int err;
+
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ mddev->new_level = n;
+ md_update_sb(mddev, 1);
+
+ mddev_unlock(mddev);
+ return len;
+}
+static struct md_sysfs_entry md_new_level =
+__ATTR(new_level, 0664, new_level_show, new_level_store);
+
+static ssize_t
layout_show(struct mddev *mddev, char *page)
{
/* just a number, not meaningful for all levels */
@@ -4722,17 +4641,23 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
while (*buf) {
chunk = end_chunk = simple_strtoul(buf, &end, 0);
- if (buf == end) break;
+ if (buf == end)
+ break;
+
if (*end == '-') { /* range */
buf = end + 1;
end_chunk = simple_strtoul(buf, &end, 0);
- if (buf == end) break;
+ if (buf == end)
+ break;
}
- if (*end && !isspace(*end)) break;
- md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+
+ if (*end && !isspace(*end))
+ break;
+
+ mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
buf = skip_spaces(end);
}
- md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+ mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
out:
mddev_unlock(mddev);
return len;
@@ -4867,30 +4792,81 @@ out_unlock:
static struct md_sysfs_entry md_metadata =
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
-static ssize_t
-action_show(struct mddev *mddev, char *page)
+enum sync_action md_sync_action(struct mddev *mddev)
{
- char *type = "idle";
unsigned long recovery = mddev->recovery;
+
+ /*
+ * frozen has the highest priority, means running sync_thread will be
+ * stopped immediately, and no new sync_thread can start.
+ */
if (test_bit(MD_RECOVERY_FROZEN, &recovery))
- type = "frozen";
- else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
- (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
- if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
- type = "reshape";
- else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
- if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
- type = "resync";
- else if (test_bit(MD_RECOVERY_CHECK, &recovery))
- type = "check";
- else
- type = "repair";
- } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
- type = "recover";
- else if (mddev->reshape_position != MaxSector)
- type = "reshape";
+ return ACTION_FROZEN;
+
+ /*
+ * read-only array can't register sync_thread, and it can only
+ * add/remove spares.
+ */
+ if (!md_is_rdwr(mddev))
+ return ACTION_IDLE;
+
+ /*
+ * idle means no sync_thread is running, and no new sync_thread is
+ * requested.
+ */
+ if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
+ !test_bit(MD_RECOVERY_NEEDED, &recovery))
+ return ACTION_IDLE;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
+ mddev->reshape_position != MaxSector)
+ return ACTION_RESHAPE;
+
+ if (test_bit(MD_RECOVERY_RECOVER, &recovery))
+ return ACTION_RECOVER;
+
+ if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+ /*
+ * MD_RECOVERY_CHECK must be paired with
+ * MD_RECOVERY_REQUESTED.
+ */
+ if (test_bit(MD_RECOVERY_CHECK, &recovery))
+ return ACTION_CHECK;
+ if (test_bit(MD_RECOVERY_REQUESTED, &recovery))
+ return ACTION_REPAIR;
+ return ACTION_RESYNC;
}
- return sprintf(page, "%s\n", type);
+
+ /*
+ * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no
+ * sync_action is specified.
+ */
+ return ACTION_IDLE;
+}
+
+enum sync_action md_sync_action_by_name(const char *page)
+{
+ enum sync_action action;
+
+ for (action = 0; action < NR_SYNC_ACTIONS; ++action) {
+ if (cmd_match(page, action_name[action]))
+ return action;
+ }
+
+ return NR_SYNC_ACTIONS;
+}
+
+const char *md_sync_action_name(enum sync_action action)
+{
+ return action_name[action];
+}
+
+static ssize_t
+action_show(struct mddev *mddev, char *page)
+{
+ enum sync_action action = md_sync_action(mddev);
+
+ return sprintf(page, "%s\n", md_sync_action_name(action));
}
/**
@@ -4899,15 +4875,10 @@ action_show(struct mddev *mddev, char *page)
* @locked: if set, reconfig_mutex will still be held after this function
* return; if not set, reconfig_mutex will be released after this
* function return.
- * @check_seq: if set, only wait for curent running sync_thread to stop, noted
- * that new sync_thread can still start.
*/
-static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
+static void stop_sync_thread(struct mddev *mddev, bool locked)
{
- int sync_seq;
-
- if (check_seq)
- sync_seq = atomic_read(&mddev->sync_seq);
+ int sync_seq = atomic_read(&mddev->sync_seq);
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
if (!locked)
@@ -4928,7 +4899,8 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
wait_event(resync_wait,
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- (check_seq && sync_seq != atomic_read(&mddev->sync_seq)));
+ (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
+ sync_seq != atomic_read(&mddev->sync_seq)));
if (locked)
mddev_lock_nointr(mddev);
@@ -4939,7 +4911,7 @@ void md_idle_sync_thread(struct mddev *mddev)
lockdep_assert_held(&mddev->reconfig_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev, true, true);
+ stop_sync_thread(mddev, true);
}
EXPORT_SYMBOL_GPL(md_idle_sync_thread);
@@ -4948,7 +4920,7 @@ void md_frozen_sync_thread(struct mddev *mddev)
lockdep_assert_held(&mddev->reconfig_mutex);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev, true, false);
+ stop_sync_thread(mddev, true);
}
EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
@@ -4963,100 +4935,127 @@ void md_unfrozen_sync_thread(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
-static void idle_sync_thread(struct mddev *mddev)
+static int mddev_start_reshape(struct mddev *mddev)
{
- mutex_lock(&mddev->sync_mutex);
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-
- if (mddev_lock(mddev)) {
- mutex_unlock(&mddev->sync_mutex);
- return;
- }
-
- stop_sync_thread(mddev, false, true);
- mutex_unlock(&mddev->sync_mutex);
-}
+ int ret;
-static void frozen_sync_thread(struct mddev *mddev)
-{
- mutex_lock(&mddev->sync_mutex);
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (mddev->pers->start_reshape == NULL)
+ return -EINVAL;
- if (mddev_lock(mddev)) {
- mutex_unlock(&mddev->sync_mutex);
- return;
+ if (mddev->reshape_position == MaxSector ||
+ mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev)) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ ret = mddev->pers->start_reshape(mddev);
+ if (ret)
+ return ret;
+ } else {
+ /*
+ * If reshape is still in progress, and md_check_recovery() can
+ * continue to reshape, don't restart reshape because data can
+ * be corrupted for raid456.
+ */
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
- stop_sync_thread(mddev, false, false);
- mutex_unlock(&mddev->sync_mutex);
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
+ return 0;
}
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
+ int ret;
+ enum sync_action action;
+
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
+retry:
+ if (work_busy(&mddev->sync_work))
+ flush_work(&mddev->sync_work);
- if (cmd_match(page, "idle"))
- idle_sync_thread(mddev);
- else if (cmd_match(page, "frozen"))
- frozen_sync_thread(mddev);
- else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
- else if (cmd_match(page, "resync"))
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else if (cmd_match(page, "recover")) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (cmd_match(page, "reshape")) {
- int err;
- if (mddev->pers->start_reshape == NULL)
- return -EINVAL;
- err = mddev_lock(mddev);
- if (!err) {
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
- err = -EBUSY;
- } else if (mddev->reshape_position == MaxSector ||
- mddev->pers->check_reshape == NULL ||
- mddev->pers->check_reshape(mddev)) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- err = mddev->pers->start_reshape(mddev);
- } else {
- /*
- * If reshape is still in progress, and
- * md_check_recovery() can continue to reshape,
- * don't restart reshape because data can be
- * corrupted for raid456.
- */
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- }
- mddev_unlock(mddev);
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
+ if (work_busy(&mddev->sync_work)) {
+ mddev_unlock(mddev);
+ goto retry;
+ }
+
+ action = md_sync_action_by_name(page);
+
+ /* TODO: mdadm relies on "idle" to start sync_thread. */
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ switch (action) {
+ case ACTION_FROZEN:
+ md_frozen_sync_thread(mddev);
+ ret = len;
+ goto out;
+ case ACTION_IDLE:
+ md_idle_sync_thread(mddev);
+ break;
+ case ACTION_RESHAPE:
+ case ACTION_RECOVER:
+ case ACTION_CHECK:
+ case ACTION_REPAIR:
+ case ACTION_RESYNC:
+ ret = -EBUSY;
+ goto out;
+ default:
+ ret = -EINVAL;
+ goto out;
}
- if (err)
- return err;
- sysfs_notify_dirent_safe(mddev->sysfs_degraded);
} else {
- if (cmd_match(page, "check"))
+ switch (action) {
+ case ACTION_FROZEN:
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ ret = len;
+ goto out;
+ case ACTION_RESHAPE:
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ ret = mddev_start_reshape(mddev);
+ if (ret)
+ goto out;
+ break;
+ case ACTION_RECOVER:
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ break;
+ case ACTION_CHECK:
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- else if (!cmd_match(page, "repair"))
- return -EINVAL;
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ fallthrough;
+ case ACTION_REPAIR:
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ fallthrough;
+ case ACTION_RESYNC:
+ case ACTION_IDLE:
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
}
+
if (mddev->ro == MD_AUTO_READ) {
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
- flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
+
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
- return len;
+ ret = len;
+
+out:
+ mddev_unlock(mddev);
+ return ret;
}
static struct md_sysfs_entry md_scan_mode =
@@ -5065,7 +5064,8 @@ __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%s\n", mddev->last_sync_action);
+ return sprintf(page, "%s\n",
+ md_sync_action_name(mddev->last_sync_action));
}
static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
@@ -5633,6 +5633,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
static struct attribute *md_default_attrs[] = {
&md_level.attr,
+ &md_new_level.attr,
&md_layout.attr,
&md_raid_disks.attr,
&md_uuid.attr,
@@ -5755,14 +5756,20 @@ static const struct kobj_type md_ktype = {
int mdp_major = 0;
/* stack the limit for all rdevs into lim */
-void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim)
+int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
+ unsigned int flags)
{
struct md_rdev *rdev;
rdev_for_each(rdev, mddev) {
queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
+ if ((flags & MDDEV_STACK_INTEGRITY) &&
+ !queue_limits_stack_integrity_bdev(lim, rdev->bdev))
+ return -EINVAL;
}
+
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
@@ -5777,6 +5784,14 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
lim = queue_limits_start_update(mddev->gendisk->queue);
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
+
+ if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) {
+ pr_err("%s: incompatible integrity profile for %pg\n",
+ mdname(mddev), rdev->bdev);
+ queue_limits_cancel_update(mddev->gendisk->queue);
+ return -ENXIO;
+ }
+
return queue_limits_commit_update(mddev->gendisk->queue, &lim);
}
EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
@@ -5806,6 +5821,14 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
}
+void md_init_stacking_limits(struct queue_limits *lim)
+{
+ blk_set_stacking_limits(lim);
+ lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
+ BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
+}
+EXPORT_SYMBOL_GPL(md_init_stacking_limits);
+
struct mddev *md_alloc(dev_t dev, char *name)
{
/*
@@ -5823,7 +5846,7 @@ struct mddev *md_alloc(dev_t dev, char *name)
int partitioned;
int shift;
int unit;
- int error ;
+ int error;
/*
* Wait for any previous instance of this device to be completely
@@ -5881,7 +5904,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
disk->fops = &md_fops;
disk->private_data = mddev;
- blk_queue_write_cache(disk->queue, true, true);
disk->events |= DISK_EVENT_MEDIA_CHANGE;
mddev->gendisk = disk;
error = add_disk(disk);
@@ -6152,16 +6174,10 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
- struct bitmap *bitmap;
-
- bitmap = md_bitmap_create(mddev, -1);
- if (IS_ERR(bitmap)) {
- err = PTR_ERR(bitmap);
+ err = mddev->bitmap_ops->create(mddev, -1);
+ if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
- } else
- mddev->bitmap = bitmap;
-
}
if (err)
goto bitmap_abort;
@@ -6185,28 +6201,6 @@ int md_run(struct mddev *mddev)
}
}
- if (!mddev_is_dm(mddev)) {
- struct request_queue *q = mddev->gendisk->queue;
- bool nonrot = true;
-
- rdev_for_each(rdev, mddev) {
- if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) {
- nonrot = false;
- break;
- }
- }
- if (mddev->degraded)
- nonrot = false;
- if (nonrot)
- blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
- blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
-
- /* Set the NOWAIT flags if all underlying devices support it */
- if (nowait)
- blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
- }
if (pers->sync_request) {
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_redundancy_group))
@@ -6253,7 +6247,7 @@ bitmap_abort:
pers->free(mddev, mddev->private);
mddev->private = NULL;
module_put(pers->owner);
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
@@ -6272,9 +6266,10 @@ int do_md_run(struct mddev *mddev)
err = md_run(mddev);
if (err)
goto out;
- err = md_bitmap_load(mddev);
+
+ err = mddev->bitmap_ops->load(mddev);
if (err) {
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
goto out;
}
@@ -6418,7 +6413,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- md_bitmap_flush(mddev);
+
+ mddev->bitmap_ops->flush(mddev);
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
@@ -6437,7 +6433,7 @@ void md_stop_writes(struct mddev *mddev)
{
mddev_lock_nointr(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev, true, false);
+ stop_sync_thread(mddev, true);
__md_stop_writes(mddev);
mddev_unlock(mddev);
}
@@ -6445,7 +6441,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
- md_bitmap_wait_behind_writes(mddev);
+ mddev->bitmap_ops->wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -6460,7 +6456,8 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
- md_bitmap_destroy(mddev);
+
+ mddev->bitmap_ops->destroy(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
@@ -6505,7 +6502,7 @@ static int md_set_readonly(struct mddev *mddev)
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
- stop_sync_thread(mddev, false, false);
+ stop_sync_thread(mddev, false);
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
@@ -6551,7 +6548,7 @@ static int do_md_stop(struct mddev *mddev, int mode)
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
- stop_sync_thread(mddev, true, false);
+ stop_sync_thread(mddev, true);
if (mddev->sysfs_active ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
@@ -7166,15 +7163,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
if (!mddev->thread)
md_update_sb(mddev, 1);
/*
- * If the new disk does not support REQ_NOWAIT,
- * disable on the whole MD.
- */
- if (!bdev_nowait(rdev->bdev)) {
- pr_info("%s: Disabling nowait because %pg does not support nowait\n",
- mdname(mddev), rdev->bdev);
- blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue);
- }
- /*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
*/
@@ -7247,22 +7235,19 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
- struct bitmap *bitmap;
+ err = mddev->bitmap_ops->create(mddev, -1);
+ if (!err)
+ err = mddev->bitmap_ops->load(mddev);
- bitmap = md_bitmap_create(mddev, -1);
- if (!IS_ERR(bitmap)) {
- mddev->bitmap = bitmap;
- err = md_bitmap_load(mddev);
- } else
- err = PTR_ERR(bitmap);
if (err) {
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
fd = -1;
}
} else if (fd < 0) {
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
}
}
+
if (fd < 0) {
struct file *f = mddev->bitmap_info.file;
if (f) {
@@ -7531,7 +7516,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
goto err;
}
if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
- struct bitmap *bitmap;
/* add the bitmap */
if (mddev->bitmap) {
rv = -EEXIST;
@@ -7545,24 +7529,24 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
- bitmap = md_bitmap_create(mddev, -1);
- if (!IS_ERR(bitmap)) {
- mddev->bitmap = bitmap;
- rv = md_bitmap_load(mddev);
- } else
- rv = PTR_ERR(bitmap);
+ rv = mddev->bitmap_ops->create(mddev, -1);
+ if (!rv)
+ rv = mddev->bitmap_ops->load(mddev);
+
if (rv)
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
} else {
- /* remove the bitmap */
- if (!mddev->bitmap) {
- rv = -ENOENT;
+ struct md_bitmap_stats stats;
+
+ rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (rv)
goto err;
- }
- if (mddev->bitmap->storage.file) {
+
+ if (stats.file) {
rv = -EINVAL;
goto err;
}
+
if (mddev->bitmap_info.nodes) {
/* hold PW on all the bitmap lock */
if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
@@ -7577,7 +7561,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7742,12 +7726,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
return get_bitmap_file(mddev, argp);
}
- if (cmd == HOT_REMOVE_DISK)
- /* need to ensure recovery thread has run */
- wait_event_interruptible_timeout(mddev->sb_wait,
- !test_bit(MD_RECOVERY_NEEDED,
- &mddev->recovery),
- msecs_to_jiffies(5000));
if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
/* Need to flush page cache, and ensure no-one else opens
* and writes
@@ -8087,7 +8065,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread)
if (t) {
pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
set_bit(THREAD_WAKEUP, &t->flags);
- wake_up(&t->wqueue);
+ if (wq_has_sleeper(&t->wqueue))
+ wake_up(&t->wqueue);
}
rcu_read_unlock();
}
@@ -8145,7 +8124,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
- if (mddev->pers->level == 0)
+ if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
@@ -8352,6 +8331,33 @@ static void md_seq_stop(struct seq_file *seq, void *v)
spin_unlock(&all_mddevs_lock);
}
+static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
+{
+ struct md_bitmap_stats stats;
+ unsigned long used_pages;
+ unsigned long chunk_kb;
+ int err;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (err)
+ return;
+
+ chunk_kb = mddev->bitmap_info.chunksize >> 10;
+ used_pages = stats.pages - stats.missing_pages;
+
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
+ used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
+ chunk_kb ? "KB" : "B");
+
+ if (stats.file) {
+ seq_puts(seq, ", file: ");
+ seq_file_path(seq, stats.file, " \t\n");
+ }
+
+ seq_putc(seq, '\n');
+}
+
static int md_seq_show(struct seq_file *seq, void *v)
{
struct mddev *mddev;
@@ -8370,16 +8376,25 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent the bitmap from being freed after checking */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
- seq_printf(seq, "%s : %sactive", mdname(mddev),
- mddev->pers ? "" : "in");
+ seq_printf(seq, "%s : ", mdname(mddev));
if (mddev->pers) {
+ if (test_bit(MD_BROKEN, &mddev->flags))
+ seq_printf(seq, "broken");
+ else
+ seq_printf(seq, "active");
if (mddev->ro == MD_RDONLY)
seq_printf(seq, " (read-only)");
if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
seq_printf(seq, " %s", mddev->pers->name);
+ } else {
+ seq_printf(seq, "inactive");
}
sectors = 0;
@@ -8435,11 +8450,12 @@ static int md_seq_show(struct seq_file *seq, void *v)
} else
seq_printf(seq, "\n ");
- md_bitmap_status(seq, mddev->bitmap);
+ md_bitmap_status(seq, mddev);
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
@@ -8519,7 +8535,7 @@ int unregister_md_personality(struct md_personality *p)
}
EXPORT_SYMBOL(unregister_md_personality);
-int register_md_cluster_operations(struct md_cluster_operations *ops,
+int register_md_cluster_operations(const struct md_cluster_operations *ops,
struct module *module)
{
int ret = 0;
@@ -8582,6 +8598,10 @@ static int is_mddev_idle(struct mddev *mddev, int init)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_disk;
+
+ if (!init && !blk_queue_io_stat(disk->queue))
+ continue;
+
curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
@@ -8636,17 +8656,16 @@ EXPORT_SYMBOL(md_done_sync);
* A return value of 'false' means that the write wasn't recorded
* and cannot proceed as the array is being suspend.
*/
-bool md_write_start(struct mddev *mddev, struct bio *bi)
+void md_write_start(struct mddev *mddev, struct bio *bi)
{
int did_change = 0;
if (bio_data_dir(bi) != WRITE)
- return true;
+ return;
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
- flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -8674,15 +8693,9 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
if (!mddev->has_superblocks)
- return true;
+ return;
wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
- is_md_suspended(mddev));
- if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- percpu_ref_put(&mddev->writes_pending);
- return false;
- }
- return true;
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
}
EXPORT_SYMBOL(md_write_start);
@@ -8737,12 +8750,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+static void md_bitmap_start(struct mddev *mddev,
+ struct md_io_clone *md_io_clone)
+{
+ if (mddev->pers->bitmap_sector)
+ mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
+ &md_io_clone->sectors);
+
+ mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
+static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
+{
+ mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
static void md_end_clone_io(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -8767,6 +8800,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
+ if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
+ md_io_clone->offset = (*bio)->bi_iter.bi_sector;
+ md_io_clone->sectors = bio_sectors(*bio);
+ md_bitmap_start(mddev, md_io_clone);
+ }
+
clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
*bio = clone;
@@ -8785,6 +8824,9 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -8830,6 +8872,77 @@ void md_allow_write(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_allow_write);
+static sector_t md_sync_max_sectors(struct mddev *mddev,
+ enum sync_action action)
+{
+ switch (action) {
+ case ACTION_RESYNC:
+ case ACTION_CHECK:
+ case ACTION_REPAIR:
+ atomic64_set(&mddev->resync_mismatches, 0);
+ fallthrough;
+ case ACTION_RESHAPE:
+ return mddev->resync_max_sectors;
+ case ACTION_RECOVER:
+ return mddev->dev_sectors;
+ default:
+ return 0;
+ }
+}
+
+static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
+{
+ sector_t start = 0;
+ struct md_rdev *rdev;
+
+ switch (action) {
+ case ACTION_CHECK:
+ case ACTION_REPAIR:
+ return mddev->resync_min;
+ case ACTION_RESYNC:
+ if (!mddev->bitmap)
+ return mddev->recovery_cp;
+ return 0;
+ case ACTION_RESHAPE:
+ /*
+ * If the original node aborts reshaping then we continue the
+ * reshaping, so resume from reshape_position to avoid restarting
+ * the reshape from the very beginning
+ */
+ if (mddev_is_clustered(mddev) &&
+ mddev->reshape_position != MaxSector)
+ return mddev->reshape_position;
+ return 0;
+ case ACTION_RECOVER:
+ start = MaxSector;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < start)
+ start = rdev->recovery_offset;
+ rcu_read_unlock();
+
+ /* If there is a bitmap, we need to make sure all
+ * writes that started before we added a spare
+ * complete before we start doing a recovery.
+ * Otherwise the write might complete and (via
+ * bitmap_endwrite) set a bit in the bitmap after the
+ * recovery has checked that bit and skipped that
+ * region.
+ */
+ if (mddev->bitmap) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ return start;
+ default:
+ return MaxSector;
+ }
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -8846,7 +8959,8 @@ void md_do_sync(struct md_thread *thread)
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
- char *desc, *action = NULL;
+ enum sync_action action;
+ const char *desc;
struct blk_plug plug;
int ret;
@@ -8877,21 +8991,9 @@ void md_do_sync(struct md_thread *thread)
goto skip;
}
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
- desc = "data-check";
- action = "check";
- } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- desc = "requested-resync";
- action = "repair";
- } else
- desc = "resync";
- } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- desc = "reshape";
- else
- desc = "recovery";
-
- mddev->last_sync_action = action ?: desc;
+ action = md_sync_action(mddev);
+ desc = md_sync_action_name(action);
+ mddev->last_sync_action = action;
/*
* Before starting a resync we must have set curr_resync to
@@ -8902,7 +9004,8 @@ void md_do_sync(struct md_thread *thread)
* This will mean we have to start checking from the beginning again.
*
*/
-
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_start_notify(mddev);
do {
int mddev2_minor = -1;
mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -8959,56 +9062,8 @@ void md_do_sync(struct md_thread *thread)
spin_unlock(&all_mddevs_lock);
} while (mddev->curr_resync < MD_RESYNC_DELAYED);
- j = 0;
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- /* resync follows the size requested by the personality,
- * which defaults to physical size, but can be virtual size
- */
- max_sectors = mddev->resync_max_sectors;
- atomic64_set(&mddev->resync_mismatches, 0);
- /* we don't use the checkpoint if there's a bitmap */
- if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
- j = mddev->resync_min;
- else if (!mddev->bitmap)
- j = mddev->recovery_cp;
-
- } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
- max_sectors = mddev->resync_max_sectors;
- /*
- * If the original node aborts reshaping then we continue the
- * reshaping, so set j again to avoid restart reshape from the
- * first beginning
- */
- if (mddev_is_clustered(mddev) &&
- mddev->reshape_position != MaxSector)
- j = mddev->reshape_position;
- } else {
- /* recovery follows the physical size of devices */
- max_sectors = mddev->dev_sectors;
- j = MaxSector;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < j)
- j = rdev->recovery_offset;
- rcu_read_unlock();
-
- /* If there is a bitmap, we need to make sure all
- * writes that started before we added a spare
- * complete before we start doing a recovery.
- * Otherwise the write might complete and (via
- * bitmap_endwrite) set a bit in the bitmap after the
- * recovery has checked that bit and skipped that
- * region.
- */
- if (mddev->bitmap) {
- mddev->pers->quiesce(mddev, 1);
- mddev->pers->quiesce(mddev, 0);
- }
- }
+ max_sectors = md_sync_max_sectors(mddev, action);
+ j = md_sync_position(mddev, action);
pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
@@ -9090,7 +9145,8 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
- sectors = mddev->pers->sync_request(mddev, j, &skipped);
+ sectors = mddev->pers->sync_request(mddev, j, max_sectors,
+ &skipped);
if (sectors == 0) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
break;
@@ -9180,7 +9236,7 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
- mddev->pers->sync_request(mddev, max_sectors, &skipped);
+ mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > MD_RESYNC_ACTIVE) {
@@ -9476,7 +9532,7 @@ static void md_start_sync(struct work_struct *ws)
* stored on all devices. So make sure all bitmap pages get written.
*/
if (spares)
- md_bitmap_write_all(mddev->bitmap);
+ mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
"reshape" : "resync";
@@ -9564,7 +9620,7 @@ static void unregister_sync_thread(struct mddev *mddev)
void md_check_recovery(struct mddev *mddev)
{
if (mddev->bitmap)
- md_bitmap_daemon_work(mddev);
+ mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) {
if (mddev->pers->sync_request && !mddev->external) {
@@ -9762,9 +9818,7 @@ EXPORT_SYMBOL(md_reap_sync_thread);
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
sysfs_notify_dirent_safe(rdev->sysfs_state);
- wait_event_timeout(rdev->blocked_wait,
- !test_bit(Blocked, &rdev->flags) &&
- !test_bit(BlockedBadBlocks, &rdev->flags),
+ wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
msecs_to_jiffies(5000));
rdev_dec_pending(rdev, mddev);
}
@@ -9793,6 +9847,17 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
{
struct mddev *mddev = rdev->mddev;
int rv;
+
+ /*
+ * Recording new badblocks for a faulty rdev will force an unnecessary
+ * superblock update. This is fragile for external management because
+ * the userspace daemon may be trying to remove this device and a
+ * deadlock may occur. This will probably be solved in mdadm, but it is
+ * safer to avoid it.
+ */
+ if (test_bit(Faulty, &rdev->flags))
+ return 1;
+
if (is_new)
s += rdev->new_data_offset;
else
@@ -9935,7 +10000,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (ret)
pr_info("md-cluster: resize failed\n");
else
- md_bitmap_update_sb(mddev->bitmap);
+ mddev->bitmap_ops->update_sb(mddev->bitmap);
}
/* Check for change of roles in the active devices */
@@ -9963,8 +10028,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
*/
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
- MD_FEATURE_RESHAPE_ACTIVE)) {
- rdev2->saved_raid_disk = role;
+ MD_FEATURE_RESHAPE_ACTIVE) &&
+ !md_cluster_ops->resync_status_get(mddev)) {
+ /*
+ * -1 to make raid1_add_disk() set conf->fullsync
+ * to 1. This could avoid skipping sync when the
+ * remote node is down during resyncing.
+ */
+ if ((le32_to_cpu(sb->feature_map)
+ & MD_FEATURE_RECOVERY_OFFSET))
+ rdev2->saved_raid_disk = -1;
+ else
+ rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %pg\n",
rdev2->bdev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 097d9dbd69b8..def808064ad8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -34,6 +34,61 @@
*/
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
+/* Status of sync thread. */
+enum sync_action {
+ /*
+ * Represented by MD_RECOVERY_SYNC, started when:
+ * 1) after assembly, sync data from first rdev to other copies; this
+ * must be done before other sync actions and will only execute
+ * once;
+ * 2) resizing the array (note that this is not reshape), sync data for
+ * the new range;
+ */
+ ACTION_RESYNC,
+ /*
+ * Represented by MD_RECOVERY_RECOVER, started when:
+ * 1) for new replacement, sync data based on the replace rdev or
+ * available copies from other rdev;
+ * 2) for new member disk while the array is degraded, sync data from
+ * other rdev;
+ * 3) reassemble after power failure or re-add a hot removed rdev, sync
+ * data from first rdev to other copies based on bitmap;
+ */
+ ACTION_RECOVER,
+ /*
+ * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED |
+ * MD_RECOVERY_CHECK, started when user echoes "check" to sysfs api
+ * sync_action, used to check if data copies from different rdevs are
+ * the same. The number of mismatched sectors will be exported to user
+ * by sysfs api mismatch_cnt;
+ */
+ ACTION_CHECK,
+ /*
+ * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, started when
+ * user echoes "repair" to sysfs api sync_action, usually paired with
+ * ACTION_CHECK, used to force syncing data once user found that there
+ * is inconsistent data;
+ */
+ ACTION_REPAIR,
+ /*
+ * Represented by MD_RECOVERY_RESHAPE, started when new member disk is
+ * added to the conf; note that this is different from spares or
+ * replacement;
+ */
+ ACTION_RESHAPE,
+ /*
+ * Represented by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action
+ * or by internal usage like setting the array read-only; forbids the
+ * above actions.
+ */
+ ACTION_FROZEN,
+ /*
+ * None of the above actions match.
+ */
+ ACTION_IDLE,
+ NR_SYNC_ACTIONS,
+};
+
/*
* The struct embedded in rdev is used to serialize IO.
*/
@@ -371,13 +426,12 @@ struct mddev {
struct md_thread __rcu *thread; /* management thread */
struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */
- /* 'last_sync_action' is initialized to "none". It is set when a
- * sync operation (i.e "data-check", "requested-resync", "resync",
- * "recovery", or "reshape") is started. It holds this value even
+ /*
+ * Set when a sync operation is started. It holds this value even
* when the sync thread is "frozen" (interrupted) or "idle" (stopped
- * or finished). It is overwritten when a new sync operation is begun.
+ * or finished). It is overwritten when a new sync operation is begun.
*/
- char *last_sync_action;
+ enum sync_action last_sync_action;
sector_t curr_resync; /* last block scheduled */
/* As resync requests can complete out of order, we cannot easily track
* how much resync has been completed. So we occasionally pause until
@@ -481,7 +535,8 @@ struct mddev {
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
- struct bitmap *bitmap; /* the bitmap for the device */
+ void *bitmap; /* the bitmap for the device */
+ struct bitmap_operations *bitmap_ops;
struct {
struct file *file; /* the bitmap file */
loff_t offset; /* offset from superblock of
@@ -517,16 +572,6 @@ struct mddev {
*/
struct bio_set io_clone_set;
- /* Generic flush handling.
- * The last to finish preflush schedules a worker to submit
- * the rest of the request (without the REQ_PREFLUSH flag).
- */
- struct bio *flush_bio;
- atomic_t flush_pending;
- ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
- * flush was started.
- */
- struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
@@ -540,8 +585,6 @@ struct mddev {
*/
struct list_head deleting;
- /* Used to synchronize idle and frozen for action_store() */
- struct mutex sync_mutex;
/* The sequence number for sync thread */
atomic_t sync_seq;
@@ -551,22 +594,46 @@ struct mddev {
};
enum recovery_flags {
+ /* flags for sync thread running status */
+
/*
- * If neither SYNC or RESHAPE are set, then it is a recovery.
+ * set when one of the sync actions is set and a new sync thread needs
+ * to be registered, or just to add/remove spares from conf.
*/
- MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */
- MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */
- MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */
- MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */
- MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */
- MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */
- MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */
- MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */
- MD_RECOVERY_RESHAPE, /* A reshape is happening */
- MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
- MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */
- MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */
- MD_RESYNCING_REMOTE, /* remote node is running resync thread */
+ MD_RECOVERY_NEEDED,
+ /* sync thread is running, or about to be started */
+ MD_RECOVERY_RUNNING,
+ /* sync thread needs to be aborted for some reason */
+ MD_RECOVERY_INTR,
+ /* sync thread is done and is waiting to be unregistered */
+ MD_RECOVERY_DONE,
+ /* running sync thread must abort immediately, and not restart */
+ MD_RECOVERY_FROZEN,
+ /* waiting for pers->start() to finish */
+ MD_RECOVERY_WAIT,
+ /* interrupted because io-error */
+ MD_RECOVERY_ERROR,
+
+ /* flags that determine the sync action, see details in enum sync_action */
+
+ /* if just this flag is set, action is resync. */
+ MD_RECOVERY_SYNC,
+ /*
+ * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set,
+ * action is repair, means user requested resync.
+ */
+ MD_RECOVERY_REQUESTED,
+ /*
+ * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is
+ * check.
+ */
+ MD_RECOVERY_CHECK,
+ /* recovery, or need to try it */
+ MD_RECOVERY_RECOVER,
+ /* reshape */
+ MD_RECOVERY_RESHAPE,
+ /* remote node is running resync thread */
+ MD_RESYNCING_REMOTE,
};
enum md_ro_state {
@@ -621,7 +688,8 @@ extern void mddev_unlock(struct mddev *mddev);
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
- atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
+ if (blk_queue_io_stat(bdev->bd_disk->queue))
+ atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
}
static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
@@ -652,7 +720,8 @@ struct md_personality
int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*spare_active) (struct mddev *mddev);
- sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped);
+ sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr,
+ sector_t max_sector, int *skipped);
int (*resize) (struct mddev *mddev, sector_t sectors);
sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
int (*check_reshape) (struct mddev *mddev);
@@ -677,6 +746,9 @@ struct md_personality
void *(*takeover) (struct mddev *mddev);
/* Changes the consistency policy of an active array. */
int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
+ /* convert io ranges from array to bitmap */
+ void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
+ unsigned long *sectors);
};
struct md_sysfs_entry {
@@ -759,6 +831,8 @@ struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
+ sector_t offset;
+ unsigned long sectors;
struct bio bio_clone;
};
@@ -771,7 +845,7 @@ static inline void safe_put_page(struct page *p)
extern int register_md_personality(struct md_personality *p);
extern int unregister_md_personality(struct md_personality *p);
-extern int register_md_cluster_operations(struct md_cluster_operations *ops,
+extern int register_md_cluster_operations(const struct md_cluster_operations *ops,
struct module *module);
extern int unregister_md_cluster_operations(void);
extern int md_setup_cluster(struct mddev *mddev, int nodes);
@@ -784,7 +858,10 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t
extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
-extern bool md_write_start(struct mddev *mddev, struct bio *bi);
+extern enum sync_action md_sync_action(struct mddev *mddev);
+extern enum sync_action md_sync_action_by_name(const char *page);
+extern const char *md_sync_action_name(enum sync_action action);
+extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
@@ -808,11 +885,11 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(struct mddev *mddev);
extern int md_integrity_register(struct mddev *mddev);
-extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern int mddev_init(struct mddev *mddev);
extern void mddev_destroy(struct mddev *mddev);
+void md_init_stacking_limits(struct queue_limits *lim);
struct mddev *md_alloc(dev_t dev, char *name);
void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
@@ -851,7 +928,7 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
}
}
-extern struct md_cluster_operations *md_cluster_ops;
+extern const struct md_cluster_operations *md_cluster_ops;
static inline int mddev_is_clustered(struct mddev *mddev)
{
return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
@@ -907,7 +984,9 @@ void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev);
-void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim);
+#define MDDEV_STACK_INTEGRITY (1u << 0)
+int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
+ unsigned int flags);
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
@@ -928,6 +1007,30 @@ static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
}
+static inline bool rdev_blocked(struct md_rdev *rdev)
+{
+ /*
+ * Blocked will be set by error handler and cleared by daemon after
+ * updating superblock, meanwhile write IO should be blocked to prevent
+ * reading old data after power failure.
+ */
+ if (test_bit(Blocked, &rdev->flags))
+ return true;
+
+ /*
+ * Faulty device should not be accessed anymore, there is no need to
+ * wait for bad block to be acknowledged.
+ */
+ if (test_bit(Faulty, &rdev->flags))
+ return false;
+
+ /* rdev is blocked by badblocks. */
+ if (test_bit(BlockedBadBlocks, &rdev->flags))
+ return true;
+
+ return false;
+}
+
#define mddev_add_trace_msg(mddev, fmt, args...) \
do { \
if (!mddev_is_dm(mddev)) \
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 798c9c53a343..8f8792e55806 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -38,7 +38,7 @@ struct array_block {
*/
#define CSUM_XOR 595846735
-static void array_block_prepare_for_write(struct dm_block_validator *v,
+static void array_block_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b,
size_t size_of_block)
{
@@ -50,7 +50,7 @@ static void array_block_prepare_for_write(struct dm_block_validator *v,
CSUM_XOR));
}
-static int array_block_check(struct dm_block_validator *v,
+static int array_block_check(const struct dm_block_validator *v,
struct dm_block *b,
size_t size_of_block)
{
@@ -77,7 +77,7 @@ static int array_block_check(struct dm_block_validator *v,
return 0;
}
-static struct dm_block_validator array_validator = {
+static const struct dm_block_validator array_validator = {
.name = "array",
.prepare_for_write = array_block_prepare_for_write,
.check = array_block_check
@@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c)
if (c->block)
unlock_ablock(c->info, c->block);
- c->block = NULL;
- c->ab = NULL;
c->index = 0;
r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le);
if (r) {
DMERR("dm_btree_cursor_get_value failed");
- dm_btree_cursor_end(&c->cursor);
+ goto out;
} else {
r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab);
if (r) {
DMERR("get_ablock failed");
- dm_btree_cursor_end(&c->cursor);
+ goto out;
}
}
+ return 0;
+
+out:
+ dm_btree_cursor_end(&c->cursor);
+ c->block = NULL;
+ c->ab = NULL;
return r;
}
@@ -956,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin);
void dm_array_cursor_end(struct dm_array_cursor *c)
{
- if (c->block) {
+ if (c->block)
unlock_ablock(c->info, c->block);
- dm_btree_cursor_end(&c->cursor);
- }
+
+ dm_btree_cursor_end(&c->cursor);
}
EXPORT_SYMBOL_GPL(dm_array_cursor_end);
@@ -999,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
}
count -= remaining;
+ c->index += (remaining - 1);
r = dm_array_cursor_next(c);
} while (!r);
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index b17b54df673b..1ef71e5fcde7 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -345,7 +345,7 @@ void *dm_block_data(struct dm_block *b)
EXPORT_SYMBOL_GPL(dm_block_data);
struct buffer_aux {
- struct dm_block_validator *validator;
+ const struct dm_block_validator *validator;
int write_locked;
#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING
@@ -441,7 +441,7 @@ dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
static int dm_bm_validate_buffer(struct dm_block_manager *bm,
struct dm_buffer *buf,
struct buffer_aux *aux,
- struct dm_block_validator *v)
+ const struct dm_block_validator *v)
{
if (unlikely(!aux->validator)) {
int r;
@@ -467,7 +467,7 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm,
return 0;
}
int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result)
{
struct buffer_aux *aux;
@@ -500,7 +500,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
EXPORT_SYMBOL_GPL(dm_bm_read_lock);
int dm_bm_write_lock(struct dm_block_manager *bm,
- dm_block_t b, struct dm_block_validator *v,
+ dm_block_t b, const struct dm_block_validator *v,
struct dm_block **result)
{
struct buffer_aux *aux;
@@ -536,7 +536,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
EXPORT_SYMBOL_GPL(dm_bm_write_lock);
int dm_bm_read_try_lock(struct dm_block_manager *bm,
- dm_block_t b, struct dm_block_validator *v,
+ dm_block_t b, const struct dm_block_validator *v,
struct dm_block **result)
{
struct buffer_aux *aux;
@@ -569,7 +569,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
}
int dm_bm_write_lock_zero(struct dm_block_manager *bm,
- dm_block_t b, struct dm_block_validator *v,
+ dm_block_t b, const struct dm_block_validator *v,
struct dm_block **result)
{
int r;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index f706d3de8d5a..b1998968594c 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -51,12 +51,14 @@ dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
*/
struct dm_block_validator {
const char *name;
- void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
+ void (*prepare_for_write)(const struct dm_block_validator *v,
+ struct dm_block *b, size_t block_size);
/*
* Return 0 if the checksum is valid or < 0 on error.
*/
- int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
+ int (*check)(const struct dm_block_validator *v,
+ struct dm_block *b, size_t block_size);
};
/*----------------------------------------------------------------*/
@@ -73,11 +75,11 @@ struct dm_block_validator {
* written back to the disk sometime after dm_bm_unlock is called.
*/
int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result);
int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result);
/*
@@ -85,7 +87,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
* available immediately.
*/
int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result);
/*
@@ -93,7 +95,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
* overwrite the block completely. It saves a disk read.
*/
int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result);
void dm_bm_unlock(struct dm_block *b);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 7ed2ce656fcc..acebd32858a7 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -138,7 +138,7 @@ static inline uint64_t value64(struct btree_node *n, uint32_t index)
*/
int lower_bound(struct btree_node *n, uint64_t key);
-extern struct dm_block_validator btree_node_validator;
+extern const struct dm_block_validator btree_node_validator;
/*
* Value type for upper levels of multi-level btrees.
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index 7540383b7cf3..c46fc50c274e 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -16,7 +16,7 @@
#define BTREE_CSUM_XOR 121107
-static void node_prepare_for_write(struct dm_block_validator *v,
+static void node_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -29,7 +29,7 @@ static void node_prepare_for_write(struct dm_block_validator *v,
BTREE_CSUM_XOR));
}
-static int node_check(struct dm_block_validator *v,
+static int node_check(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -81,7 +81,7 @@ static int node_check(struct dm_block_validator *v,
return 0;
}
-struct dm_block_validator btree_node_validator = {
+const struct dm_block_validator btree_node_validator = {
.name = "btree_node",
.prepare_for_write = node_prepare_for_write,
.check = node_check
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 591d1a43d035..22a551c407da 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -22,7 +22,7 @@
*/
#define INDEX_CSUM_XOR 160478
-static void index_prepare_for_write(struct dm_block_validator *v,
+static void index_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -34,7 +34,7 @@ static void index_prepare_for_write(struct dm_block_validator *v,
INDEX_CSUM_XOR));
}
-static int index_check(struct dm_block_validator *v,
+static int index_check(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -51,7 +51,7 @@ static int index_check(struct dm_block_validator *v,
block_size - sizeof(__le32),
INDEX_CSUM_XOR));
if (csum_disk != mi_le->csum) {
- DMERR_LIMIT("i%s failed: csum %u != wanted %u", __func__,
+ DMERR_LIMIT("%s failed: csum %u != wanted %u", __func__,
le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
return -EILSEQ;
}
@@ -59,7 +59,7 @@ static int index_check(struct dm_block_validator *v,
return 0;
}
-static struct dm_block_validator index_validator = {
+static const struct dm_block_validator index_validator = {
.name = "index",
.prepare_for_write = index_prepare_for_write,
.check = index_check
@@ -72,7 +72,7 @@ static struct dm_block_validator index_validator = {
*/
#define BITMAP_CSUM_XOR 240779
-static void dm_bitmap_prepare_for_write(struct dm_block_validator *v,
+static void dm_bitmap_prepare_for_write(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -84,7 +84,7 @@ static void dm_bitmap_prepare_for_write(struct dm_block_validator *v,
BITMAP_CSUM_XOR));
}
-static int dm_bitmap_check(struct dm_block_validator *v,
+static int dm_bitmap_check(const struct dm_block_validator *v,
struct dm_block *b,
size_t block_size)
{
@@ -109,7 +109,7 @@ static int dm_bitmap_check(struct dm_block_validator *v,
return 0;
}
-static struct dm_block_validator dm_sm_bitmap_validator = {
+static const struct dm_block_validator dm_sm_bitmap_validator = {
.name = "sm_bitmap",
.prepare_for_write = dm_bitmap_prepare_for_write,
.check = dm_bitmap_check,
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 04698fd03e60..d48c4fafc779 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -277,7 +277,7 @@ static void sm_metadata_destroy(struct dm_space_map *sm)
{
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- kfree(smm);
+ kvfree(smm);
}
static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
@@ -772,7 +772,7 @@ struct dm_space_map *dm_sm_metadata_init(void)
{
struct sm_metadata *smm;
- smm = kmalloc(sizeof(*smm), GFP_KERNEL);
+ smm = kvmalloc(sizeof(*smm), GFP_KERNEL);
if (!smm)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index c88fa6266203..98c745d90f48 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/hash.h>
+#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
@@ -77,7 +78,7 @@ static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
/*----------------------------------------------------------------*/
struct shadow_info {
- struct hlist_node hlist;
+ struct rb_node node;
dm_block_t where;
};
@@ -95,7 +96,7 @@ struct dm_transaction_manager {
struct dm_space_map *sm;
spinlock_t lock;
- struct hlist_head buckets[DM_HASH_SIZE];
+ struct rb_root buckets[DM_HASH_SIZE];
struct prefetch_set prefetches;
};
@@ -106,14 +107,22 @@ static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
{
int r = 0;
unsigned int bucket = dm_hash_block(b, DM_HASH_MASK);
- struct shadow_info *si;
+ struct rb_node **node;
spin_lock(&tm->lock);
- hlist_for_each_entry(si, tm->buckets + bucket, hlist)
- if (si->where == b) {
+ node = &tm->buckets[bucket].rb_node;
+ while (*node) {
+ struct shadow_info *si =
+ rb_entry(*node, struct shadow_info, node);
+ if (b == si->where) {
r = 1;
break;
}
+ if (b < si->where)
+ node = &si->node.rb_left;
+ else
+ node = &si->node.rb_right;
+ }
spin_unlock(&tm->lock);
return r;
@@ -130,30 +139,41 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
si = kmalloc(sizeof(*si), GFP_NOIO);
if (si) {
+ struct rb_node **node, *parent;
si->where = b;
bucket = dm_hash_block(b, DM_HASH_MASK);
+
spin_lock(&tm->lock);
- hlist_add_head(&si->hlist, tm->buckets + bucket);
+ node = &tm->buckets[bucket].rb_node;
+ parent = NULL;
+ while (*node) {
+ struct shadow_info *si =
+ rb_entry(*node, struct shadow_info, node);
+ parent = *node;
+ if (b < si->where)
+ node = &si->node.rb_left;
+ else
+ node = &si->node.rb_right;
+ }
+ rb_link_node(&si->node, parent, node);
+ rb_insert_color(&si->node, &tm->buckets[bucket]);
spin_unlock(&tm->lock);
}
}
static void wipe_shadow_table(struct dm_transaction_manager *tm)
{
- struct shadow_info *si;
- struct hlist_node *tmp;
- struct hlist_head *bucket;
- int i;
+ unsigned int i;
spin_lock(&tm->lock);
for (i = 0; i < DM_HASH_SIZE; i++) {
- bucket = tm->buckets + i;
- hlist_for_each_entry_safe(si, tmp, bucket, hlist)
+ while (!RB_EMPTY_ROOT(&tm->buckets[i])) {
+ struct shadow_info *si =
+ rb_entry(tm->buckets[i].rb_node, struct shadow_info, node);
+ rb_erase(&si->node, &tm->buckets[i]);
kfree(si);
-
- INIT_HLIST_HEAD(bucket);
+ }
}
-
spin_unlock(&tm->lock);
}
@@ -162,7 +182,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm)
static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
struct dm_space_map *sm)
{
- int i;
+ unsigned int i;
struct dm_transaction_manager *tm;
tm = kmalloc(sizeof(*tm), GFP_KERNEL);
@@ -176,7 +196,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
spin_lock_init(&tm->lock);
for (i = 0; i < DM_HASH_SIZE; i++)
- INIT_HLIST_HEAD(tm->buckets + i);
+ tm->buckets[i] = RB_ROOT;
prefetch_init(&tm->prefetches);
@@ -237,7 +257,7 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
EXPORT_SYMBOL_GPL(dm_tm_commit);
int dm_tm_new_block(struct dm_transaction_manager *tm,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result)
{
int r;
@@ -266,7 +286,7 @@ int dm_tm_new_block(struct dm_transaction_manager *tm,
}
static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result)
{
int r;
@@ -306,7 +326,7 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
}
int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
- struct dm_block_validator *v, struct dm_block **result,
+ const struct dm_block_validator *v, struct dm_block **result,
int *inc_children)
{
int r;
@@ -331,7 +351,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
EXPORT_SYMBOL_GPL(dm_tm_shadow_block);
int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **blk)
{
if (tm->is_clone) {
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 01f7e650118d..61a8d10825ca 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -64,7 +64,7 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock)
* Zeroes the new block and returns with write lock held.
*/
int dm_tm_new_block(struct dm_transaction_manager *tm,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result);
/*
@@ -84,7 +84,7 @@ int dm_tm_new_block(struct dm_transaction_manager *tm,
* it locked when you call this.
*/
int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result, int *inc_children);
/*
@@ -92,7 +92,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
* on it outstanding then it'll block.
*/
int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
- struct dm_block_validator *v,
+ const struct dm_block_validator *v,
struct dm_block **result);
void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c5d4aeb68404..8fc9339b00c7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -365,30 +365,31 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
return array_sectors;
}
-static void free_conf(struct mddev *mddev, struct r0conf *conf)
-{
- kfree(conf->strip_zone);
- kfree(conf->devlist);
- kfree(conf);
-}
-
static void raid0_free(struct mddev *mddev, void *priv)
{
struct r0conf *conf = priv;
- free_conf(mddev, conf);
+ kfree(conf->strip_zone);
+ kfree(conf->devlist);
+ kfree(conf);
}
static int raid0_set_limits(struct mddev *mddev)
{
struct queue_limits lim;
+ int err;
- blk_set_stacking_limits(&lim);
+ md_init_stacking_limits(&lim);
lim.max_hw_sectors = mddev->chunk_sectors;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * mddev->raid_disks;
- mddev_stack_rdev_limits(mddev, &lim);
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
+ err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
+ if (err) {
+ queue_limits_cancel_update(mddev->gendisk->queue);
+ return err;
+ }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -415,7 +416,7 @@ static int raid0_run(struct mddev *mddev)
if (!mddev_is_dm(mddev)) {
ret = raid0_set_limits(mddev);
if (ret)
- goto out_free_conf;
+ return ret;
}
/* calculate array device size */
@@ -427,13 +428,7 @@ static int raid0_run(struct mddev *mddev)
dump_zones(mddev);
- ret = md_integrity_register(mddev);
- if (ret)
- goto out_free_conf;
- return 0;
-out_free_conf:
- free_conf(mddev, conf);
- return ret;
+ return md_integrity_register(mddev);
}
/*
@@ -472,6 +467,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
struct bio *split = bio_split(bio,
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
&mddev->bio_set);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return;
+ }
bio_chain(split, bio);
submit_bio_noacct(bio);
bio = split;
@@ -614,6 +615,12 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
if (sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, sectors, GFP_NOIO,
&mddev->bio_set);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return true;
+ }
bio_chain(split, bio);
raid0_map_submit_bio(mddev, bio);
bio = split;
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 2ea1710a3b70..4378d3250bd7 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* If bitmap is not enabled, it's safe to submit the io directly, and
* this can get optimal performance.
*/
- if (!md_bitmap_enabled(mddev->bitmap)) {
+ if (!mddev->bitmap_ops->enabled(mddev)) {
raid1_submit_write(bio);
return true;
}
@@ -166,12 +166,9 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* while current io submission must wait for bitmap io to be done. In order to
* avoid such deadlock, submit bitmap io asynchronously.
*/
-static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
+static inline void raid1_prepare_flush_writes(struct mddev *mddev)
{
- if (current->bio_list)
- md_bitmap_unplug_async(bitmap);
- else
- md_bitmap_unplug(bitmap);
+ mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL);
}
/*
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b8a71ca66dd..9d57a88dbd26 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -411,18 +411,18 @@ static void raid1_end_read_request(struct bio *bio)
static void close_write(struct r1bio *r1_bio)
{
+ struct mddev *mddev = r1_bio->mddev;
+
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
bio_free_pages(r1_bio->behind_master_bio);
bio_put(r1_bio->behind_master_bio);
r1_bio->behind_master_bio = NULL;
}
- /* clear the bitmap if all writes complete successfully */
- md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
- r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
- md_write_end(r1_bio->mddev);
+
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->end_behind_write(mddev);
+ md_write_end(mddev);
}
static void r1_bio_write_done(struct r1bio *r1_bio)
@@ -478,8 +478,6 @@ static void raid1_end_write_request(struct bio *bio)
if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
else {
- /* Fail the request */
- set_bit(R1BIO_Degraded, &r1_bio->state);
/* Finished with this branch */
r1_bio->bios[mirror] = NULL;
to_put = bio;
@@ -617,6 +615,12 @@ static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
return -1;
}
+static bool rdev_in_recovery(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+ return !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < r1_bio->sector + r1_bio->sectors;
+}
+
static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
@@ -635,6 +639,7 @@ static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
rdev = conf->mirrors[disk].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ rdev_in_recovery(rdev, r1_bio) ||
test_bit(WriteMostly, &rdev->flags))
continue;
@@ -673,13 +678,15 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
rdev = conf->mirrors[disk].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags) ||
- !test_bit(WriteMostly, &rdev->flags))
+ !test_bit(WriteMostly, &rdev->flags) ||
+ rdev_in_recovery(rdev, r1_bio))
continue;
/* there are no bad blocks, we can use this disk */
len = r1_bio->sectors;
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len == r1_bio->sectors) {
+ *max_sectors = read_len;
update_read_sectors(conf, disk, this_sector, read_len);
return disk;
}
@@ -732,9 +739,7 @@ static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
if (!rdev || test_bit(Faulty, &rdev->flags))
return false;
- /* still in recovery */
- if (!test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+ if (rdev_in_recovery(rdev, r1_bio))
return false;
/* don't read from slow disk unless have to */
@@ -893,7 +898,7 @@ static void wake_up_barrier(struct r1conf *conf)
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
- raid1_prepare_flush_writes(conf->mddev->bitmap);
+ raid1_prepare_flush_writes(conf->mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
@@ -1310,13 +1315,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct bio *read_bio;
- struct bitmap *bitmap = mddev->bitmap;
const enum req_op op = bio_op(bio);
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
- int rdisk;
+ int rdisk, error;
bool r1bio_existed = !!r1_bio;
- char b[BDEVNAME_SIZE];
/*
* If r1_bio is set, we are blocking the raid1d thread
@@ -1325,16 +1328,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
*/
gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
- if (r1bio_existed) {
- /* Need to get the block device name carefully */
- struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
-
- if (rdev)
- snprintf(b, sizeof(b), "%pg", rdev->bdev);
- else
- strcpy(b, "???");
- }
-
/*
* Still need barrier for READ in case that whole
* array is frozen.
@@ -1356,15 +1349,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* used and no empty request is available.
*/
rdisk = read_balance(conf, r1_bio, &max_sectors);
-
if (rdisk < 0) {
/* couldn't find anywhere to read from */
- if (r1bio_existed) {
- pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+ if (r1bio_existed)
+ pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
mdname(mddev),
- b,
- (unsigned long long)r1_bio->sector);
- }
+ conf->mirrors[r1_bio->read_disk].rdev->bdev,
+ r1_bio->sector);
raid_end_bio_io(r1_bio);
return;
}
@@ -1376,20 +1367,23 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r1_bio->sector,
mirror->rdev->bdev);
- if (test_bit(WriteMostly, &mirror->rdev->flags) &&
- bitmap) {
+ if (test_bit(WriteMostly, &mirror->rdev->flags)) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
mddev_add_trace_msg(mddev, "raid1 wait behind writes");
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
+ mddev->bitmap_ops->wait_behind_writes(mddev);
}
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
+
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
submit_bio_noacct(bio);
bio = split;
@@ -1417,6 +1411,47 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_private = r1_bio;
mddev_trace_remap(mddev, read_bio, r1_bio->sector);
submit_bio_noacct(read_bio);
+ return;
+
+err_handle:
+ atomic_dec(&mirror->rdev->nr_pending);
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ raid_end_bio_io(r1_bio);
+}
+
+static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio)
+{
+ struct r1conf *conf = mddev->private;
+ int disks = conf->raid_disks * 2;
+ int i;
+
+retry:
+ for (i = 0; i < disks; i++) {
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ if (!rdev)
+ continue;
+
+ /* don't write here until the bad block is acknowledged */
+ if (test_bit(WriteErrorSeen, &rdev->flags) &&
+ rdev_has_badblock(rdev, bio->bi_iter.bi_sector,
+ bio_sectors(bio)) < 0)
+ set_bit(BlockedBadBlocks, &rdev->flags);
+
+ if (rdev_blocked(rdev)) {
+ if (bio->bi_opf & REQ_NOWAIT)
+ return false;
+
+ mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked",
+ rdev->raid_disk);
+ atomic_inc(&rdev->nr_pending);
+ md_wait_for_blocked_rdev(rdev, rdev->mddev);
+ goto retry;
+ }
+ }
+
+ return true;
}
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
@@ -1424,10 +1459,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- int i, disks;
- struct bitmap *bitmap = mddev->bitmap;
+ int i, disks, k, error;
unsigned long flags;
- struct md_rdev *blocked_rdev;
int first_clone;
int max_sectors;
bool write_behind = false;
@@ -1465,7 +1498,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
return;
}
- retry_write:
+ if (!wait_blocked_rdev(mddev, bio)) {
+ bio_wouldblock_error(bio);
+ return;
+ }
+
r1_bio = alloc_r1bio(mddev, bio);
r1_bio->sectors = max_write_sectors;
@@ -1481,7 +1518,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
disks = conf->raid_disks * 2;
- blocked_rdev = NULL;
max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
@@ -1494,17 +1530,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
write_behind = true;
- if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- atomic_inc(&rdev->nr_pending);
- blocked_rdev = rdev;
- break;
- }
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
- if (i < conf->raid_disks)
- set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
- }
atomic_inc(&rdev->nr_pending);
if (test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -1514,13 +1542,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
&first_bad, &bad_sectors);
- if (is_bad < 0) {
- /* mustn't write here until the bad block is
- * acknowledged*/
- set_bit(BlockedBadBlocks, &rdev->flags);
- blocked_rdev = rdev;
- break;
- }
if (is_bad && first_bad <= r1_bio->sector) {
/* Cannot write here at all */
bad_sectors -= (r1_bio->sector - first_bad);
@@ -1530,20 +1551,24 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
max_sectors = bad_sectors;
rdev_dec_pending(rdev, mddev);
- /* We don't set R1BIO_Degraded as that
- * only applies if the disk is
- * missing, so it might be re-added,
- * and we want to know to recover this
- * chunk.
- * In this case the device is here,
- * and the fact that this chunk is not
- * in-sync is recorded in the bad
- * block log
- */
continue;
}
if (is_bad) {
- int good_sectors = first_bad - r1_bio->sector;
+ int good_sectors;
+
+ /*
+ * We cannot atomically write this, so just
+ * error in that case. It could be possible to
+ * atomically write other mirrors, but the
+ * complexity of supporting that is not worth
+ * the benefit.
+ */
+ if (bio->bi_opf & REQ_ATOMIC) {
+ error = -EIO;
+ goto err_handle;
+ }
+
+ good_sectors = first_bad - r1_bio->sector;
if (good_sectors < max_sectors)
max_sectors = good_sectors;
}
@@ -1551,39 +1576,23 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->bios[i] = bio;
}
- if (unlikely(blocked_rdev)) {
- /* Wait for this device to become unblocked */
- int j;
-
- for (j = 0; j < i; j++)
- if (r1_bio->bios[j])
- rdev_dec_pending(conf->mirrors[j].rdev, mddev);
- mempool_free(r1_bio, &conf->r1bio_pool);
- allow_barrier(conf, bio->bi_iter.bi_sector);
-
- if (bio->bi_opf & REQ_NOWAIT) {
- bio_wouldblock_error(bio);
- return;
- }
- mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
- blocked_rdev->raid_disk);
- md_wait_for_blocked_rdev(blocked_rdev, mddev);
- wait_barrier(conf, bio->bi_iter.bi_sector, false);
- goto retry_write;
- }
-
/*
* When using a bitmap, we may call alloc_behind_master_bio below.
* alloc_behind_master_bio allocates a copy of the data payload a page
* at a time and thus needs a new bio that can fit the whole payload
* this bio in page sized chunks.
*/
- if (write_behind && bitmap)
+ if (write_behind && mddev->bitmap)
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
GFP_NOIO, &conf->bio_split);
+
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
submit_bio_noacct(bio);
bio = split;
@@ -1605,19 +1614,22 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
continue;
if (first_clone) {
+ unsigned long max_write_behind =
+ mddev->bitmap_info.max_write_behind;
+ struct md_bitmap_stats stats;
+ int err;
+
/* do behind I/O ?
* Not if there are too many, or cannot
* allocate memory, or a reader on WriteMostly
* is waiting for behind writes to flush */
- if (bitmap && write_behind &&
- (atomic_read(&bitmap->behind_writes)
- < mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait)) {
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (!err && write_behind && !stats.behind_wait &&
+ stats.behind_writes < max_write_behind)
alloc_behind_master_bio(r1_bio, bio);
- }
- md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->start_behind_write(mddev);
first_clone = 0;
}
@@ -1641,7 +1653,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
+ mbio->bi_opf = bio_op(bio) |
+ (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC));
if (test_bit(FailFast, &rdev->flags) &&
!test_bit(WriteMostly, &rdev->flags) &&
conf->raid_disks - mddev->degraded > 1)
@@ -1664,6 +1677,18 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
/* In case raid1d snuck in to freeze_array */
wake_up_barrier(conf);
+ return;
+err_handle:
+ for (k = 0; k < i; k++) {
+ if (r1_bio->bios[k]) {
+ rdev_dec_pending(conf->mirrors[k].rdev, mddev);
+ r1_bio->bios[k] = NULL;
+ }
+ }
+
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ raid_end_bio_io(r1_bio);
}
static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
@@ -1687,8 +1712,7 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
if (bio_data_dir(bio) == READ)
raid1_read_request(mddev, bio, sectors, NULL);
else {
- if (!md_write_start(mddev,bio))
- return false;
+ md_write_start(mddev,bio);
raid1_write_request(mddev, bio, sectors);
}
return true;
@@ -1907,9 +1931,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
- if (md_integrity_add_rdev(rdev, mddev))
- return -ENXIO;
-
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
@@ -2039,7 +2060,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
/* make sure these bits don't get cleared. */
do {
- md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks);
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
@@ -2575,12 +2596,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* errors.
*/
fail = true;
- if (!narrow_write_error(r1_bio, m)) {
+ if (!narrow_write_error(r1_bio, m))
md_error(conf->mddev,
conf->mirrors[m].rdev);
/* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- }
rdev_dec_pending(conf->mirrors[m].rdev,
conf->mddev);
}
@@ -2671,8 +2690,6 @@ static void raid1d(struct md_thread *thread)
list_del(&r1_bio->retry_list);
idx = sector_to_idx(r1_bio->sector);
atomic_dec(&conf->nr_queued[idx]);
- if (mddev->degraded)
- set_bit(R1BIO_Degraded, &r1_bio->state);
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
raid_end_bio_io(r1_bio);
@@ -2757,18 +2774,18 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
*/
static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
- int *skipped)
+ sector_t max_sector, int *skipped)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
struct bio *bio;
- sector_t max_sector, nr_sectors;
+ sector_t nr_sectors;
int disk = -1;
int i;
int wonly = -1;
int write_targets = 0, read_targets = 0;
sector_t sync_blocks;
- int still_degraded = 0;
+ bool still_degraded = false;
int good_sectors = RESYNC_SECTORS;
int min_bad = 0; /* number of sectors that are bad in all devices */
int idx = sector_to_idx(sector_nr);
@@ -2778,7 +2795,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (init_resync(conf))
return 0;
- max_sector = mddev->dev_sectors;
if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chunk (there will
@@ -2786,12 +2802,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* We can find the current addess in mddev->curr_resync
*/
if (mddev->curr_resync < max_sector) /* aborted */
- md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
- &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- md_bitmap_close_sync(mddev->bitmap);
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
@@ -2811,7 +2827,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* before building a request, check if we can skip these blocks..
* This call the bitmap_start_sync doesn't actually record anything
*/
- if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
@@ -2829,9 +2845,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* sector_nr + two times RESYNC_SECTORS
*/
- md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
- mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
-
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ mddev_is_clustered(mddev) &&
+ (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (raise_barrier(conf, sector_nr))
return 0;
@@ -2862,7 +2878,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
- still_degraded = 1;
+ still_degraded = true;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
@@ -2986,8 +3002,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
if (sync_blocks == 0) {
- if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
- &sync_blocks, still_degraded) &&
+ if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
+ &sync_blocks, still_degraded) &&
!conf->fullsync &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
@@ -3197,14 +3213,19 @@ static struct r1conf *setup_conf(struct mddev *mddev)
static int raid1_set_limits(struct mddev *mddev)
{
struct queue_limits lim;
+ int err;
- blk_set_stacking_limits(&lim);
+ md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
- mddev_stack_rdev_limits(mddev, &lim);
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
+ err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
+ if (err) {
+ /* No queue_limits_start_update() was issued: nothing to cancel. */
+ return err;
+ }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
-static void raid1_free(struct mddev *mddev, void *priv);
static int raid1_run(struct mddev *mddev)
{
struct r1conf *conf;
@@ -3238,7 +3259,7 @@ static int raid1_run(struct mddev *mddev)
if (!mddev_is_dm(mddev)) {
ret = raid1_set_limits(mddev);
if (ret)
- goto abort;
+ return ret;
}
mddev->degraded = 0;
@@ -3252,8 +3273,7 @@ static int raid1_run(struct mddev *mddev)
*/
if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(mddev, &conf->thread);
- ret = -EINVAL;
- goto abort;
+ return -EINVAL;
}
if (conf->raid_disks - mddev->degraded == 1)
@@ -3277,14 +3297,8 @@ static int raid1_run(struct mddev *mddev)
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
ret = md_integrity_register(mddev);
- if (ret) {
+ if (ret)
md_unregister_thread(mddev, &mddev->thread);
- goto abort;
- }
- return 0;
-
-abort:
- raid1_free(mddev, conf);
return ret;
}
@@ -3314,14 +3328,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize = raid1_size(mddev, sectors, 0);
+ int ret;
+
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- if (mddev->bitmap) {
- int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
- if (ret)
- return ret;
- }
+
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ if (ret)
+ return ret;
+
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5300cbaa58a4..33f318fcc268 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -188,7 +188,6 @@ struct r1bio {
enum r1bio_state {
R1BIO_Uptodate,
R1BIO_IsSync,
- R1BIO_Degraded,
R1BIO_BehindIO,
/* Set ReadError on bios that experience a readerror so that
* raid1d knows what to do with them.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a4556d2e46bf..efe93b979167 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -426,12 +426,9 @@ static void raid10_end_read_request(struct bio *bio)
static void close_write(struct r10bio *r10_bio)
{
- /* clear the bitmap if all writes complete successfully */
- md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
- r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state),
- 0);
- md_write_end(r10_bio->mddev);
+ struct mddev *mddev = r10_bio->mddev;
+
+ md_write_end(mddev);
}
static void one_write_done(struct r10bio *r10_bio)
@@ -500,7 +497,6 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_WriteError, &r10_bio->state);
else {
/* Fail the request */
- set_bit(R10BIO_Degraded, &r10_bio->state);
r10_bio->devs[slot].bio = NULL;
to_put = bio;
dec_rdev = 1;
@@ -884,7 +880,7 @@ static void flush_pending_writes(struct r10conf *conf)
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
- raid1_prepare_flush_writes(conf->mddev->bitmap);
+ raid1_prepare_flush_writes(conf->mddev);
wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
@@ -1100,7 +1096,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- raid1_prepare_flush_writes(mddev->bitmap);
+ raid1_prepare_flush_writes(mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
@@ -1158,6 +1154,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
int slot = r10_bio->read_slot;
struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO;
+ int error;
if (slot >= 0 && r10_bio->devs[slot].rdev) {
/*
@@ -1205,6 +1202,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
allow_barrier(conf);
submit_bio_noacct(bio);
@@ -1235,6 +1236,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
mddev_trace_remap(mddev, read_bio, r10_bio->sector);
submit_bio_noacct(read_bio);
return;
+err_handle:
+ atomic_dec(&rdev->nr_pending);
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+ raid_end_bio_io(r10_bio);
}
static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
@@ -1244,6 +1250,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
const enum req_op op = bio_op(bio);
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
+ const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
unsigned long flags;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
@@ -1262,7 +1269,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_opf = op | do_sync | do_fua;
+ mbio->bi_opf = op | do_sync | do_fua | do_atomic;
if (!replacement && test_bit(FailFast,
&conf->mirrors[devnum].rdev->flags)
&& enough(conf, devnum))
@@ -1284,9 +1291,9 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{
- int i;
struct r10conf *conf = mddev->private;
struct md_rdev *blocked_rdev;
+ int i;
retry_wait:
blocked_rdev = NULL;
@@ -1294,40 +1301,36 @@ retry_wait:
struct md_rdev *rdev, *rrdev;
rdev = conf->mirrors[i].rdev;
- rrdev = conf->mirrors[i].replacement;
- if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- atomic_inc(&rdev->nr_pending);
- blocked_rdev = rdev;
- break;
- }
- if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
- atomic_inc(&rrdev->nr_pending);
- blocked_rdev = rrdev;
- break;
- }
-
- if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
+ if (rdev) {
sector_t dev_sector = r10_bio->devs[i].addr;
/*
* Discard request doesn't care the write result
* so it doesn't need to wait blocked disk here.
*/
- if (!r10_bio->sectors)
- continue;
-
- if (rdev_has_badblock(rdev, dev_sector,
- r10_bio->sectors) < 0) {
+ if (test_bit(WriteErrorSeen, &rdev->flags) &&
+ r10_bio->sectors &&
+ rdev_has_badblock(rdev, dev_sector,
+ r10_bio->sectors) < 0)
/*
- * Mustn't write here until the bad block
- * is acknowledged
+ * Mustn't write here until the bad
+ * block is acknowledged
*/
- atomic_inc(&rdev->nr_pending);
set_bit(BlockedBadBlocks, &rdev->flags);
+
+ if (rdev_blocked(rdev)) {
blocked_rdev = rdev;
+ atomic_inc(&rdev->nr_pending);
break;
}
}
+
+ rrdev = conf->mirrors[i].replacement;
+ if (rrdev && rdev_blocked(rrdev)) {
+ atomic_inc(&rrdev->nr_pending);
+ blocked_rdev = rrdev;
+ break;
+ }
}
if (unlikely(blocked_rdev)) {
@@ -1346,9 +1349,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
- int i;
+ int i, k;
sector_t sectors;
int max_sectors;
+ int error;
if ((mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1429,10 +1433,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
- if (!rdev && !rrdev) {
- set_bit(R10BIO_Degraded, &r10_bio->state);
+ if (!rdev && !rrdev)
continue;
- }
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
@@ -1449,18 +1451,24 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* to other devices yet
*/
max_sectors = bad_sectors;
- /* We don't set R10BIO_Degraded as that
- * only applies if the disk is missing,
- * so it might be re-added, and we want to
- * know to recover this chunk.
- * In this case the device is here, and the
- * fact that this chunk is not in-sync is
- * recorded in the bad block log.
- */
continue;
}
if (is_bad) {
- int good_sectors = first_bad - dev_sector;
+ int good_sectors;
+
+ /*
+ * We cannot atomically write this, so just
+ * error in that case. It could be possible to
+ * atomically write other mirrors, but the
+ * complexity of supporting that is not worth
+ * the benefit.
+ */
+ if (bio->bi_opf & REQ_ATOMIC) {
+ error = -EIO;
+ goto err_handle;
+ }
+
+ good_sectors = first_bad - dev_sector;
if (good_sectors < max_sectors)
max_sectors = good_sectors;
}
@@ -1481,6 +1489,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
if (r10_bio->sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, r10_bio->sectors,
GFP_NOIO, &conf->bio_split);
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
allow_barrier(conf);
submit_bio_noacct(bio);
@@ -1492,7 +1504,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
@@ -1501,6 +1512,26 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
raid10_write_one_disk(mddev, r10_bio, bio, true, i);
}
one_write_done(r10_bio);
+ return;
+err_handle:
+ for (k = 0; k < i; k++) {
+ int d = r10_bio->devs[k].devnum;
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ struct md_rdev *rrdev = conf->mirrors[d].replacement;
+
+ if (r10_bio->devs[k].bio) {
+ rdev_dec_pending(rdev, mddev);
+ r10_bio->devs[k].bio = NULL;
+ }
+ if (r10_bio->devs[k].repl_bio) {
+ rdev_dec_pending(rrdev, mddev);
+ r10_bio->devs[k].repl_bio = NULL;
+ }
+ }
+
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+ raid_end_bio_io(r10_bio);
}
static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
@@ -1642,6 +1673,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
if (remainder) {
split_size = stripe_size - remainder;
split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return 0;
+ }
bio_chain(split, bio);
allow_barrier(conf);
/* Resend the fist split part */
@@ -1652,6 +1688,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
if (remainder) {
split_size = bio_sectors(bio) - remainder;
split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return 0;
+ }
bio_chain(split, bio);
allow_barrier(conf);
/* Resend the second split part */
@@ -1836,8 +1877,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
&& md_flush_request(mddev, bio))
return true;
- if (!md_write_start(mddev, bio))
- return false;
+ md_write_start(mddev, bio);
if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
if (!raid10_handle_discard(mddev, bio))
@@ -2083,9 +2123,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
return -EINVAL;
- if (md_integrity_add_rdev(rdev, mddev))
- return -ENXIO;
-
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
@@ -2469,7 +2506,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
s = PAGE_SIZE >> 9;
rdev = conf->mirrors[dr].rdev;
- addr = r10_bio->devs[0].addr + sect,
+ addr = r10_bio->devs[0].addr + sect;
ok = sync_page_io(rdev,
addr,
s << 9,
@@ -2912,11 +2949,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_status) {
fail = true;
- if (!narrow_write_error(r10_bio, m)) {
+ if (!narrow_write_error(r10_bio, m))
md_error(conf->mddev, rdev);
- set_bit(R10BIO_Degraded,
- &r10_bio->state);
- }
rdev_dec_pending(rdev, conf->mddev);
}
bio = r10_bio->devs[m].repl_bio;
@@ -2975,8 +3009,6 @@ static void raid10d(struct md_thread *thread)
r10_bio = list_first_entry(&tmp, struct r10bio,
retry_list);
list_del(&r10_bio->retry_list);
- if (mddev->degraded)
- set_bit(R10BIO_Degraded, &r10_bio->state);
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
@@ -3140,12 +3172,12 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf)
*/
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
- int *skipped)
+ sector_t max_sector, int *skipped)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
struct bio *biolist = NULL, *bio;
- sector_t max_sector, nr_sectors;
+ sector_t nr_sectors;
int i;
int max_sync;
sector_t sync_blocks;
@@ -3175,10 +3207,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return 0;
skipped:
- max_sector = mddev->dev_sectors;
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
- test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
@@ -3200,13 +3228,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
- &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev,
+ mddev->curr_resync,
+ &sync_blocks);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
- md_bitmap_end_sync(mddev->bitmap, sect,
- &sync_blocks, 1);
+
+ mddev->bitmap_ops->end_sync(mddev, sect,
+ &sync_blocks);
}
} else {
/* completed sync */
@@ -3226,7 +3256,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
conf->fullsync = 0;
}
- md_bitmap_close_sync(mddev->bitmap);
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@@ -3295,10 +3325,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio = NULL;
for (i = 0 ; i < conf->geo.raid_disks; i++) {
- int still_degraded;
+ bool still_degraded;
struct r10bio *rb2;
sector_t sect;
- int must_sync;
+ bool must_sync;
int any_working;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3315,7 +3345,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!mrdev && !mreplace)
continue;
- still_degraded = 0;
+ still_degraded = false;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
@@ -3328,8 +3358,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* we only need to recover the block if it is set in
* the bitmap
*/
- must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, 1);
+ must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
+ &sync_blocks,
+ true);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
@@ -3367,13 +3398,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
struct md_rdev *rdev = conf->mirrors[j].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
- still_degraded = 1;
+ still_degraded = true;
break;
}
}
- must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, still_degraded);
+ must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
+ &sync_blocks, still_degraded);
any_working = 0;
for (j=0; j<conf->copies;j++) {
@@ -3546,12 +3577,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
- md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
- if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
- &sync_blocks, mddev->degraded) &&
+ if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
+ &sync_blocks,
+ mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
@@ -3980,12 +4012,18 @@ static int raid10_set_queue_limits(struct mddev *mddev)
{
struct r10conf *conf = mddev->private;
struct queue_limits lim;
+ int err;
- blk_set_stacking_limits(&lim);
+ md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
- mddev_stack_rdev_limits(mddev, &lim);
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
+ err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
+ if (err) {
+ /* No queue_limits_start_update() was issued: nothing to cancel. */
+ return err;
+ }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -4058,9 +4096,12 @@ static int raid10_run(struct mddev *mddev)
}
if (!mddev_is_dm(conf->mddev)) {
- ret = raid10_set_queue_limits(mddev);
- if (ret)
+ int err = raid10_set_queue_limits(mddev);
+
+ if (err) {
+ ret = err;
goto out_free_conf;
+ }
}
/* need to check that every block has at least one working mirror */
@@ -4193,6 +4234,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
*/
struct r10conf *conf = mddev->private;
sector_t oldsize, size;
+ int ret;
if (mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -4205,11 +4247,11 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > size)
return -EINVAL;
- if (mddev->bitmap) {
- int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
- if (ret)
- return ret;
- }
+
+ ret = mddev->bitmap_ops->resize(mddev, size, 0, false);
+ if (ret)
+ return ret;
+
md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > oldsize) {
@@ -4475,7 +4517,7 @@ static int raid10_start_reshape(struct mddev *mddev)
newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
if (!mddev_is_clustered(mddev)) {
- ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
else
@@ -4490,20 +4532,20 @@ static int raid10_start_reshape(struct mddev *mddev)
/*
* some node is already performing reshape, and no need to
- * call md_bitmap_resize again since it should be called when
+ * call bitmap_ops->resize again since it should be called when
* receiving BITMAP_RESIZE msg
*/
if ((sb && (le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
goto out;
- ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
- md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+ mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
goto abort;
}
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2e75e88d0802..3f16ad6904a9 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -161,7 +161,6 @@ enum r10bio_state {
R10BIO_IsSync,
R10BIO_IsRecover,
R10BIO_IsReshape,
- R10BIO_Degraded,
/* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them.
*/
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 874874fe4fa1..e530271cb86b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -313,10 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- 0);
}
}
}
@@ -1023,10 +1019,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
/* checksum is already calculated in last run */
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
continue;
- addr = kmap_atomic(sh->dev[i].page);
+ addr = kmap_local_page(sh->dev[i].page);
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
addr, PAGE_SIZE);
- kunmap_atomic(addr);
+ kunmap_local(addr);
}
parity_pages = 1 + !!(sh->qd_idx >= 0);
data_pages = write_disks - parity_pages;
@@ -1979,9 +1975,9 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log,
u32 checksum;
r5l_recovery_read_page(log, ctx, page, log_offset);
- addr = kmap_atomic(page);
+ addr = kmap_local_page(page);
checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
- kunmap_atomic(addr);
+ kunmap_local(addr);
return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
}
@@ -2381,11 +2377,11 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
payload->size = cpu_to_le32(BLOCK_SECTORS);
payload->location = cpu_to_le64(
raid5_compute_blocknr(sh, i, 0));
- addr = kmap_atomic(dev->page);
+ addr = kmap_local_page(dev->page);
payload->checksum[0] = cpu_to_le32(
crc32c_le(log->uuid_checksum, addr,
PAGE_SIZE));
- kunmap_atomic(addr);
+ kunmap_local(addr);
sync_page_io(log->rdev, write_pos, PAGE_SIZE,
dev->page, REQ_OP_WRITE, false);
write_pos = r5l_ring_add(log, write_pos,
@@ -2798,7 +2794,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
{
struct r5l_log *log = READ_ONCE(conf->log);
int i;
- int do_wakeup = 0;
sector_t tree_index;
void __rcu **pslot;
uintptr_t refcount;
@@ -2815,7 +2810,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
for (i = sh->disks; i--; ) {
clear_bit(R5_InJournal, &sh->dev[i].flags);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
}
/*
@@ -2828,9 +2823,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
- if (do_wakeup)
- wake_up(&conf->wait_for_overlap);
-
spin_lock_irq(&log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
spin_unlock_irq(&log->stripe_in_journal_lock);
@@ -2892,10 +2884,10 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
- addr = kmap_atomic(sh->dev[i].page);
+ addr = kmap_local_page(sh->dev[i].page);
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
addr, PAGE_SIZE);
- kunmap_atomic(addr);
+ kunmap_local(addr);
pages++;
}
WARN_ON(pages == 0);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index a70cbec12ed0..37c4da5311ca 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -258,7 +258,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
pplhdr->signature = cpu_to_le32(ppl_conf->signature);
- io->seq = atomic64_add_return(1, &ppl_conf->seq);
+ io->seq = atomic64_inc_return(&ppl_conf->seq);
pplhdr->generation = cpu_to_le64(io->seq);
return io;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d874abfc1836..5c79429acc64 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,7 +36,6 @@
*/
#include <linux/blkdev.h>
-#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@@ -156,7 +155,7 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void print_raid5_conf (struct r5conf *conf);
+static void print_raid5_conf(struct r5conf *conf);
static int stripe_operations_active(struct stripe_head *sh)
{
@@ -907,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
- !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
- is_full_stripe_write(sh);
+ is_full_stripe_write(sh);
}
/* we only do back search */
@@ -1346,8 +1344,6 @@ again:
submit_bio_noacct(rbi);
}
if (!rdev && !rrdev) {
- if (op_is_write(op))
- set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %d on disc %d for sector %llu\n",
bi->bi_opf, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -2338,7 +2334,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
- wake_up(&sh->raid_conf->wait_for_overlap);
+ wake_up_bit(&dev->flags, R5_Overlap);
}
}
local_unlock(&conf->percpu->lock);
@@ -2885,7 +2881,6 @@ static void raid5_end_write_request(struct bio *bi)
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
- set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -3474,7 +3469,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
* With PPL only writes to consecutive data chunks within a
* stripe are allowed because for a single stripe_head we can
* only have one PPL entry at a time, which describes one data
- * range. Not really an overlap, but wait_for_overlap can be
+ * range. Not really an overlap, but R5_Overlap can be
* used to handle this.
*/
sector_t sector;
@@ -3549,29 +3544,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
sh->dev[dd_idx].sector);
- if (conf->mddev->bitmap && firstwrite) {
- /* Cannot hold spinlock over bitmap_startwrite,
- * but must ensure this isn't added to a batch until
- * we have added to the bitmap and set bm_seq.
- * So set STRIPE_BITMAP_PENDING to prevent
- * batching.
- * If multiple __add_stripe_bio() calls race here they
- * much all set STRIPE_BITMAP_PENDING. So only the first one
- * to complete "bitmap_startwrite" gets to set
- * STRIPE_BIT_DELAY. This is important as once a stripe
- * is added to a batch, STRIPE_BIT_DELAY cannot be changed
- * any more.
- */
- set_bit(STRIPE_BITMAP_PENDING, &sh->state);
- spin_unlock_irq(&sh->stripe_lock);
- md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0);
- spin_lock_irq(&sh->stripe_lock);
- clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
- if (!sh->batch_head) {
- sh->bm_seq = conf->seq_flush+1;
- set_bit(STRIPE_BIT_DELAY, &sh->state);
- }
+ if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}
@@ -3622,7 +3597,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
- int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3647,13 +3621,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
- if (bi)
- bitmap_end = 1;
log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
@@ -3663,10 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
- if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0, 0);
- bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
@@ -3675,7 +3643,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}
- if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3697,7 +3664,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
@@ -3709,9 +3676,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
}
- if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0, 0);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -3735,7 +3699,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
@@ -4060,10 +4024,7 @@ returnbi:
bio_endio(wbi);
wbi = wbi2;
}
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- 0);
+
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -4340,7 +4301,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
s->locked++;
set_bit(R5_Wantwrite, &dev->flags);
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
case check_state_run:
@@ -4497,7 +4457,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
clear_bit(R5_Wantwrite, &dev->flags);
s->locked--;
}
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
@@ -4723,14 +4682,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev) {
is_bad = rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf));
- if (s->blocked_rdev == NULL
- && (test_bit(Blocked, &rdev->flags)
- || is_bad < 0)) {
+ if (s->blocked_rdev == NULL) {
if (is_bad < 0)
- set_bit(BlockedBadBlocks,
- &rdev->flags);
- s->blocked_rdev = rdev;
- atomic_inc(&rdev->nr_pending);
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ if (rdev_blocked(rdev)) {
+ s->blocked_rdev = rdev;
+ atomic_inc(&rdev->nr_pending);
+ }
}
}
clear_bit(R5_Insync, &dev->flags);
@@ -4876,7 +4834,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
{
struct stripe_head *sh, *next;
int i;
- int do_wakeup = 0;
list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
@@ -4892,8 +4849,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_COMPUTE_RUN) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
- (1 << STRIPE_BATCH_ERR) |
- (1 << STRIPE_BITMAP_PENDING)),
+ (1 << STRIPE_BATCH_ERR)),
"stripe state: %lx\n", sh->state);
WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
(1 << STRIPE_REPLACED)),
@@ -4901,7 +4857,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
(1 << STRIPE_PREREAD_ACTIVE) |
- (1 << STRIPE_DEGRADED) |
(1 << STRIPE_ON_UNPLUG_LIST)),
head_sh->state & (1 << STRIPE_INSYNC));
@@ -4912,7 +4867,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&sh->stripe_lock);
for (i = 0; i < sh->disks; i++) {
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
sh->dev[i].flags = head_sh->dev[i].flags &
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
}
@@ -4926,12 +4881,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&head_sh->stripe_lock);
for (i = 0; i < head_sh->disks; i++)
if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&head_sh->dev[i].flags, R5_Overlap);
if (head_sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &head_sh->state);
-
- if (do_wakeup)
- wake_up(&head_sh->raid_conf->wait_for_overlap);
}
static void handle_stripe(struct stripe_head *sh)
@@ -5197,7 +5149,7 @@ static void handle_stripe(struct stripe_head *sh)
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
}
/* If the failed drives are just a ReadError, then we might need
@@ -5260,7 +5212,7 @@ static void handle_stripe(struct stripe_head *sh)
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
@@ -5754,12 +5706,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
int d;
again:
sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
- prepare_to_wait(&conf->wait_for_overlap, &w,
- TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
raid5_release_stripe(sh);
- schedule();
+ wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap,
+ TASK_UNINTERRUPTIBLE);
goto again;
}
clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5771,12 +5722,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
raid5_release_stripe(sh);
- schedule();
+ wait_on_bit(&sh->dev[d].flags, R5_Overlap,
+ TASK_UNINTERRUPTIBLE);
goto again;
}
}
set_bit(STRIPE_DISCARD, &sh->state);
- finish_wait(&conf->wait_for_overlap, &w);
sh->overwrite_disks = 0;
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -5789,13 +5740,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0;
- d < conf->raid_disks - conf->max_degraded;
- d++)
- md_bitmap_startwrite(mddev->bitmap,
- sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- 0);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
@@ -5856,7 +5800,6 @@ static int add_all_stripe_bios(struct r5conf *conf,
struct bio *bi, int forwrite, int previous)
{
int dd_idx;
- int ret = 1;
spin_lock_irq(&sh->stripe_lock);
@@ -5872,14 +5815,19 @@ static int add_all_stripe_bios(struct r5conf *conf,
if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
set_bit(R5_Overlap, &dev->flags);
- ret = 0;
- continue;
+ spin_unlock_irq(&sh->stripe_lock);
+ raid5_release_stripe(sh);
+ /* release batch_last before wait to avoid risk of deadlock */
+ if (ctx->batch_last) {
+ raid5_release_stripe(ctx->batch_last);
+ ctx->batch_last = NULL;
+ }
+ md_wakeup_thread(conf->mddev->thread);
+ wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE);
+ return 0;
}
}
- if (!ret)
- goto out;
-
for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
struct r5dev *dev = &sh->dev[dd_idx];
@@ -5895,9 +5843,89 @@ static int add_all_stripe_bios(struct r5conf *conf,
RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
}
-out:
spin_unlock_irq(&sh->stripe_lock);
- return ret;
+ return 1;
+}
+
+enum reshape_loc { /* where a logical sector sits relative to a reshape, as classified by get_reshape_loc() */
+ LOC_NO_RESHAPE, /* conf->reshape_progress is MaxSector */
+ LOC_AHEAD_OF_RESHAPE, /* ahead_of_reshape() holds against reshape_progress */
+ LOC_INSIDE_RESHAPE, /* not ahead of reshape_progress, but ahead of reshape_safe */
+ LOC_BEHIND_RESHAPE, /* ahead of neither reshape_progress nor reshape_safe */
+};
+
+static enum reshape_loc get_reshape_loc(struct mddev *mddev,
+		struct r5conf *conf, sector_t logical_sector)
+{
+	sector_t progress;
+	sector_t safe;
+
+	/*
+	 * Take a consistent snapshot of both reshape markers under
+	 * device_lock: reshape_progress may be a 64bit value on a 32bit
+	 * platform, so an unlocked read could observe a half-updated
+	 * value.  The markers can still move once the lock is dropped,
+	 * so a caller that goes on to take a stripe reference has to
+	 * check again afterwards.
+	 */
+	spin_lock_irq(&conf->device_lock);
+	safe = conf->reshape_safe;
+	progress = conf->reshape_progress;
+	spin_unlock_irq(&conf->device_lock);
+
+	if (progress != MaxSector) {
+		if (ahead_of_reshape(mddev, logical_sector, progress))
+			return LOC_AHEAD_OF_RESHAPE;
+
+		return ahead_of_reshape(mddev, logical_sector, safe) ?
+			LOC_INSIDE_RESHAPE : LOC_BEHIND_RESHAPE;
+	}
+
+	return LOC_NO_RESHAPE;
+}
+
+/*
+ * Round @sector down to a multiple of @unit.  @unit need not be a power
+ * of two, which the mask-based round_down() macro would require.
+ */
+static sector_t raid5_round_down_sectors(sector_t sector, unsigned int unit)
+{
+	sector_t quot = sector;
+
+	/* sector_div() leaves the quotient in @quot and returns the remainder */
+	return sector - sector_div(quot, unit);
+}
+
+/*
+ * Round @sector up to a multiple of @unit.  @unit need not be a power
+ * of two, which the mask-based round_up() macro would require.
+ */
+static sector_t raid5_round_up_sectors(sector_t sector, unsigned int unit)
+{
+	sector_t quot = sector;
+	unsigned int rem = sector_div(quot, unit);
+
+	return rem ? sector + (unit - rem) : sector;
+}
+
+/*
+ * raid5_bitmap_sector() - widen an IO range to whole stripes and translate
+ * it with raid5_compute_sector().
+ * @mddev:   array the IO targets; mddev->private is the r5conf.
+ * @offset:  in: first logical sector of the IO; out: start of the
+ *           translated range.
+ * @sectors: in: length of the IO in sectors; out: length of the
+ *           translated range.
+ *
+ * A "stripe" here is chunk_sectors multiplied by the number of data disks
+ * (raid_disks - max_degraded).  When the IO starts ahead of a running
+ * reshape the same computation is repeated for the previous geometry and
+ * the union of both ranges is returned.
+ */
+static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
+				unsigned long *sectors)
+{
+	struct r5conf *conf = mddev->private;
+	sector_t start = *offset;
+	sector_t end = start + *sectors;
+	sector_t prev_start = start;
+	sector_t prev_end = end;
+	int sectors_per_chunk;
+	enum reshape_loc loc;
+	int dd_idx;
+
+	/*
+	 * The data-disk count is arbitrary, so this product is generally
+	 * not a power of two; use the division-based helpers.
+	 */
+	sectors_per_chunk = conf->chunk_sectors *
+			    (conf->raid_disks - conf->max_degraded);
+	start = raid5_round_down_sectors(start, sectors_per_chunk);
+	end = raid5_round_up_sectors(end, sectors_per_chunk);
+
+	start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
+	end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL);
+
+	/*
+	 * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make
+	 * progress, hence it's the same as LOC_BEHIND_RESHAPE.
+	 */
+	loc = get_reshape_loc(mddev, conf, prev_start);
+	if (likely(loc != LOC_AHEAD_OF_RESHAPE)) {
+		*offset = start;
+		*sectors = end - start;
+		return;
+	}
+
+	/*
+	 * Same expansion for the previous geometry.  The end is rounded
+	 * up, exactly like 'end' above, so that the tail of the IO is
+	 * covered in the old layout as well.
+	 */
+	sectors_per_chunk = conf->prev_chunk_sectors *
+			    (conf->previous_raid_disks - conf->max_degraded);
+	prev_start = raid5_round_down_sectors(prev_start, sectors_per_chunk);
+	prev_end = raid5_round_up_sectors(prev_end, sectors_per_chunk);
+
+	prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL);
+	prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL);
+
+	/*
+	 * for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO
+	 * is handled in make_stripe_request(), we can't know this here hence
+	 * we set bits for both.
+	 */
+	*offset = min(start, prev_start);
+	*sectors = max(end, prev_end) - *offset;
}
static enum stripe_result make_stripe_request(struct mddev *mddev,
@@ -5914,28 +5942,14 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
seq = read_seqcount_begin(&conf->gen_lock);
if (unlikely(conf->reshape_progress != MaxSector)) {
- /*
- * Spinlock is needed as reshape_progress may be
- * 64bit on a 32bit platform, and so it might be
- * possible to see a half-updated value
- * Of course reshape_progress could change after
- * the lock is dropped, so once we get a reference
- * to the stripe that we think it is, we will have
- * to check again.
- */
- spin_lock_irq(&conf->device_lock);
- if (ahead_of_reshape(mddev, logical_sector,
- conf->reshape_progress)) {
- previous = 1;
- } else {
- if (ahead_of_reshape(mddev, logical_sector,
- conf->reshape_safe)) {
- spin_unlock_irq(&conf->device_lock);
- ret = STRIPE_SCHEDULE_AND_RETRY;
- goto out;
- }
+ enum reshape_loc loc = get_reshape_loc(mddev, conf,
+ logical_sector);
+ if (loc == LOC_INSIDE_RESHAPE) {
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
- spin_unlock_irq(&conf->device_lock);
+ if (loc == LOC_AHEAD_OF_RESHAPE)
+ previous = 1;
}
new_sector = raid5_compute_sector(conf, logical_sector, previous,
@@ -5974,17 +5988,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
goto out_release;
}
- if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
- /*
- * Stripe is busy expanding or add failed due to
- * overlap. Flush everything and wait a while.
- */
+ if (test_bit(STRIPE_EXPANDING, &sh->state)) {
md_wakeup_thread(mddev->thread);
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out_release;
}
+ if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
+ ret = STRIPE_RETRY;
+ goto out;
+ }
+
if (stripe_can_batch(sh)) {
stripe_add_to_batch_list(conf, sh, ctx->batch_last);
if (ctx->batch_last)
@@ -6055,6 +6069,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ bool on_wq;
struct r5conf *conf = mddev->private;
sector_t logical_sector;
struct stripe_request_ctx ctx = {};
@@ -6079,8 +6094,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
}
- if (!md_write_start(mddev, bi))
- return false;
+ md_write_start(mddev, bi);
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
@@ -6114,8 +6128,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
(conf->reshape_progress != MaxSector) &&
- !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
- ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
+ get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) {
bio_wouldblock_error(bi);
if (rw == WRITE)
md_write_end(mddev);
@@ -6130,11 +6143,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* sequential IO pattern. We don't bother with the optimization when
* reshaping as the performance benefit is not worth the complexity.
*/
- if (likely(conf->reshape_progress == MaxSector))
+ if (likely(conf->reshape_progress == MaxSector)) {
logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
+ on_wq = false;
+ } else {
+ add_wait_queue(&conf->wait_for_reshape, &wait);
+ on_wq = true;
+ }
s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
- add_wait_queue(&conf->wait_for_overlap, &wait);
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
@@ -6145,6 +6162,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
continue;
if (res == STRIPE_SCHEDULE_AND_RETRY) {
+ WARN_ON_ONCE(!on_wq);
/*
* Must release the reference to batch_last before
* scheduling and waiting for work to be done,
@@ -6169,7 +6187,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = ctx.first_sector +
(s << RAID5_STRIPE_SHIFT(conf));
}
- remove_wait_queue(&conf->wait_for_overlap, &wait);
+ if (unlikely(on_wq))
+ remove_wait_queue(&conf->wait_for_reshape, &wait);
if (ctx.batch_last)
raid5_release_stripe(ctx.batch_last);
@@ -6256,7 +6275,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
- BUG_ON(writepos < reshape_sectors);
+ if (WARN_ON(writepos < reshape_sectors))
+ return MaxSector;
+
writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
@@ -6274,14 +6295,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* to set 'stripe_addr' which is where we will write to.
*/
if (mddev->reshape_backwards) {
- BUG_ON(conf->reshape_progress == 0);
+ if (WARN_ON(conf->reshape_progress == 0))
+ return MaxSector;
+
stripe_addr = writepos;
- BUG_ON((mddev->dev_sectors &
- ~((sector_t)reshape_sectors - 1))
- - reshape_sectors - stripe_addr
- != sector_nr);
+ if (WARN_ON((mddev->dev_sectors &
+ ~((sector_t)reshape_sectors - 1)) -
+ reshape_sectors - stripe_addr != sector_nr))
+ return MaxSector;
} else {
- BUG_ON(writepos != sector_nr + reshape_sectors);
+ if (WARN_ON(writepos != sector_nr + reshape_sectors))
+ return MaxSector;
+
stripe_addr = sector_nr;
}
@@ -6316,7 +6341,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
: (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
- wait_event(conf->wait_for_overlap,
+ wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes)==0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6342,7 +6367,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
@@ -6425,7 +6450,7 @@ finish:
(sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
- wait_event(conf->wait_for_overlap,
+ wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes) == 0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6451,7 +6476,7 @@ finish:
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
@@ -6459,13 +6484,12 @@ ret:
}
static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
- int *skipped)
+ sector_t max_sector, int *skipped)
{
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
- sector_t max_sector = mddev->dev_sectors;
sector_t sync_blocks;
- int still_degraded = 0;
+ bool still_degraded = false;
int i;
if (sector_nr >= max_sector) {
@@ -6477,17 +6501,17 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
- md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
- &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- md_bitmap_close_sync(mddev->bitmap);
+ mddev->bitmap_ops->close_sync(mddev);
return 0;
}
/* Allow raid5_quiesce to complete */
- wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+ wait_event(conf->wait_for_reshape, conf->quiesce != 2);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -6510,7 +6534,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
- !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+ true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
@@ -6519,7 +6544,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
- md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
@@ -6538,10 +6563,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
- still_degraded = 1;
+ still_degraded = true;
}
- md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
+ mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+ still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -6734,6 +6760,9 @@ static void raid5d(struct md_thread *thread)
int batch_size, released;
unsigned int offset;
+ if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+ break;
+
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
clear_bit(R5_DID_ALLOC, &conf->cache_state);
@@ -6743,7 +6772,7 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
- md_bitmap_unplug(mddev->bitmap);
+ mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
@@ -6770,18 +6799,7 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
-
- /*
- * Waiting on MD_SB_CHANGE_PENDING below may deadlock
- * seeing md_check_recovery() is needed to clear
- * the flag when using mdmon.
- */
- continue;
}
-
- wait_event_lock_irq(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
- conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -7091,12 +7109,14 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
err = -ENODEV;
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->gendisk->queue;
+ struct queue_limits lim = queue_limits_start_update(q);
conf->skip_copy = new;
if (new)
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
+ lim.features |= BLK_FEAT_STABLE_WRITES;
else
- blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
+ lim.features &= ~BLK_FEAT_STABLE_WRITES;
+ err = queue_limits_commit_update(q, &lim);
}
mddev_unlock_and_resume(mddev);
return err ?: len;
@@ -7156,6 +7176,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
err = mddev_suspend_and_lock(mddev);
if (err)
return err;
+ raid5_quiesce(mddev, true);
+
conf = mddev->private;
if (!conf)
err = -ENODEV;
@@ -7177,6 +7199,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
kfree(old_groups);
}
}
+
+ raid5_quiesce(mddev, false);
mddev_unlock_and_resume(mddev);
return err ?: len;
@@ -7477,7 +7501,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
- init_waitqueue_head(&conf->wait_for_overlap);
+ init_waitqueue_head(&conf->wait_for_reshape);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
@@ -7571,11 +7595,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto abort;
- RCU_INIT_POINTER(disk->replacement, rdev);
+ disk->replacement = rdev;
} else {
if (disk->rdev)
goto abort;
- RCU_INIT_POINTER(disk->rdev, rdev);
+ disk->rdev = rdev;
}
if (test_bit(In_sync, &rdev->flags)) {
@@ -7711,13 +7735,13 @@ static int raid5_set_limits(struct mddev *mddev)
*/
stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
- blk_set_stacking_limits(&lim);
+ md_init_stacking_limits(&lim);
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
- lim.raid_partial_stripes_expensive = 1;
+ lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;
- mddev_stack_rdev_limits(mddev, &lim);
+ mddev_stack_rdev_limits(mddev, &lim, 0);
rdev_for_each(rdev, mddev)
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
mddev->gendisk->disk_name);
@@ -8057,7 +8081,7 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
seq_printf (seq, "]");
}
-static void print_raid5_conf (struct r5conf *conf)
+static void print_raid5_conf(struct r5conf *conf)
{
struct md_rdev *rdev;
int i;
@@ -8071,15 +8095,13 @@ static void print_raid5_conf (struct r5conf *conf)
conf->raid_disks,
conf->raid_disks - conf->mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- rdev = rcu_dereference(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
if (rdev)
pr_debug(" disk %d, o:%d, dev:%pg\n",
i, !test_bit(Faulty, &rdev->flags),
rdev->bdev);
}
- rcu_read_unlock();
}
static int raid5_spare_active(struct mddev *mddev)
@@ -8299,6 +8321,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
+ int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
@@ -8307,11 +8330,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- if (mddev->bitmap) {
- int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
- if (ret)
- return ret;
- }
+
+ ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
+ if (ret)
+ return ret;
+
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
@@ -8537,7 +8560,7 @@ static void end_reshape(struct r5conf *conf)
!test_bit(In_sync, &rdev->flags))
rdev->recovery_offset = MaxSector;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
mddev_update_io_opt(conf->mddev,
conf->raid_disks - conf->max_degraded);
@@ -8601,13 +8624,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
conf->quiesce = 1;
unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
} else {
/* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_quiescent);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
unlock_all_device_hash_locks_irq(conf);
}
log_quiesce(conf, quiesce);
@@ -8926,7 +8949,7 @@ static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
}
static struct md_personality raid6_personality =
@@ -8953,6 +8976,7 @@ static struct md_personality raid6_personality =
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid5_personality =
{
@@ -8978,6 +9002,7 @@ static struct md_personality raid5_personality =
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid4_personality =
@@ -9004,6 +9029,7 @@ static struct md_personality raid4_personality =
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9b5a7dc3f2a0..eafc6e9ed6ee 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -358,7 +358,6 @@ enum {
STRIPE_REPLACED,
STRIPE_PREREAD_ACTIVE,
STRIPE_DELAYED,
- STRIPE_DEGRADED,
STRIPE_BIT_DELAY,
STRIPE_EXPANDING,
STRIPE_EXPAND_SOURCE,
@@ -372,9 +371,6 @@ enum {
STRIPE_ON_RELEASE_LIST,
STRIPE_BATCH_READY,
STRIPE_BATCH_ERR,
- STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
- * to batch yet.
- */
STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
* this bit is used in two scenarios:
*
@@ -633,7 +629,7 @@ struct r5conf {
* two caches.
*/
int active_name;
- char cache_name[2][32];
+ char cache_name[2][48];
struct kmem_cache *slab_cache; /* for allocating stripes */
struct mutex cache_size_mutex; /* Protect changes to cache size */
@@ -668,7 +664,7 @@ struct r5conf {
struct llist_head released_stripes;
wait_queue_head_t wait_for_quiescent;
wait_queue_head_t wait_for_stripe;
- wait_queue_head_t wait_for_overlap;
+ wait_queue_head_t wait_for_reshape;
unsigned long cache_state;
struct shrinker *shrinker;
int pool_size; /* number of disks in stripeheads in pool */