summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bcache.h24
-rw-r--r--drivers/md/bcache/bset.c63
-rw-r--r--drivers/md/bcache/btree.c63
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/bcache/closure.c13
-rw-r--r--drivers/md/bcache/closure.h4
-rw-r--r--drivers/md/bcache/debug.c17
-rw-r--r--drivers/md/bcache/journal.c1
-rw-r--r--drivers/md/bcache/request.c75
-rw-r--r--drivers/md/bcache/super.c59
-rw-r--r--drivers/md/bcache/sysfs.c48
-rw-r--r--drivers/md/bcache/util.c2
-rw-r--r--drivers/md/bcache/util.h2
-rw-r--r--drivers/md/bcache/writeback.c125
-rw-r--r--drivers/md/bcache/writeback.h19
-rw-r--r--drivers/md/dm-raid.c2
-rw-r--r--drivers/md/dm-table.c7
-rw-r--r--drivers/md/dm-thin-metadata.c9
-rw-r--r--drivers/md/dm-thin.c11
-rw-r--r--drivers/md/dm-writecache.c53
-rw-r--r--drivers/md/dm-zoned-target.c2
-rw-r--r--drivers/md/dm.c14
-rw-r--r--drivers/md/md-cluster.c47
-rw-r--r--drivers/md/md.c37
-rw-r--r--drivers/md/md.h1
-rw-r--r--drivers/md/raid10.c7
-rw-r--r--drivers/md/raid5-cache.c2
-rw-r--r--drivers/md/raid5.c12
28 files changed, 521 insertions, 200 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6bf294f3907..05f82ff6f016 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
*/
atomic_t has_dirty;
- /*
- * Set to zero by things that touch the backing volume-- except
- * writeback. Incremented by writeback. Used to determine when to
- * accelerate idle writeback.
- */
- atomic_t backing_idle;
-
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
@@ -423,9 +416,9 @@ struct cache {
/*
* When allocating new buckets, prio_write() gets first dibs - since we
* may not be allocate at all without writing priorities and gens.
- * prio_buckets[] contains the last buckets we wrote priorities to (so
- * gc can mark them as metadata), prio_next[] contains the buckets
- * allocated for the next prio write.
+ * prio_last_buckets[] contains the last buckets we wrote priorities to
+ * (so gc can mark them as metadata), prio_buckets[] contains the
+ * buckets allocated for the next prio write.
*/
uint64_t *prio_buckets;
uint64_t *prio_last_buckets;
@@ -474,6 +467,7 @@ struct cache {
struct gc_stat {
size_t nodes;
+ size_t nodes_pre;
size_t key_bytes;
size_t nkeys;
@@ -514,6 +508,8 @@ struct cache_set {
struct cache_accounting accounting;
unsigned long flags;
+ atomic_t idle_counter;
+ atomic_t at_max_writeback_rate;
struct cache_sb sb;
@@ -523,8 +519,10 @@ struct cache_set {
struct bcache_device **devices;
unsigned devices_max_used;
+ atomic_t attached_dev_nr;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
+ atomic_long_t flash_dev_dirty_sectors;
struct closure caching;
struct closure sb_write;
@@ -603,6 +601,10 @@ struct cache_set {
*/
atomic_t rescale;
/*
+ * used for GC, identify if any front side I/Os is inflight
+ */
+ atomic_t search_inflight;
+ /*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
@@ -995,7 +997,7 @@ void bch_open_buckets_free(struct cache_set *);
int bch_cache_allocator_start(struct cache *ca);
void bch_debug_exit(void);
-int bch_debug_init(struct kobject *);
+void bch_debug_init(struct kobject *kobj);
void bch_request_exit(void);
int bch_request_init(void);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f3403b45bc28..596c93b44e9b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
+/*
+ * return array index next to j when does in-order traverse
+ * of a binary tree which is stored in a linear array
+ */
static unsigned inorder_next(unsigned j, unsigned size)
{
if (j * 2 + 1 < size) {
@@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size)
return j;
}
+/*
+ * return array index previous to j when does in-order traverse
+ * of a binary tree which is stored in a linear array
+ */
static unsigned inorder_prev(unsigned j, unsigned size)
{
if (j * 2 < size) {
@@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
return j;
}
+/*
+ * Return the cacheline index in bset_tree->data, where j is index
+ * from a linear array which stores the auxiliar binary tree
+ */
static unsigned to_inorder(unsigned j, struct bset_tree *t)
{
return __to_inorder(j, t->size, t->extra);
@@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
return j;
}
+/*
+ * Return an index from a linear array which stores the auxiliar binary
+ * tree, j is the cacheline index of t->data.
+ */
static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
{
return __inorder_to_tree(j, t->size, t->extra);
@@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
return low;
}
+/*
+ * Calculate mantissa value for struct bkey_float.
+ * If most significant bit of f->exponent is not set, then
+ * - f->exponent >> 6 is 0
+ * - p[0] points to bkey->low
+ * - p[-1] borrows bits from KEY_INODE() of bkey->high
+ * if most isgnificant bits of f->exponent is set, then
+ * - f->exponent >> 6 is 1
+ * - p[0] points to bits from KEY_INODE() of bkey->high
+ * - p[-1] points to other bits from KEY_INODE() of
+ * bkey->high too.
+ * See make_bfloat() to check when most significant bit of f->exponent
+ * is set or not.
+ */
static inline unsigned bfloat_mantissa(const struct bkey *k,
struct bkey_float *f)
{
@@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
BUG_ON(m < l || m > r);
BUG_ON(bkey_next(p) != m);
+ /*
+ * If l and r have different KEY_INODE values (different backing
+ * device), f->exponent records how many least significant bits
+ * are different in KEY_INODE values and sets most significant
+ * bits to 1 (by +64).
+ * If l and r have same KEY_INODE value, f->exponent records
+ * how many different bits in least significant bits of bkey->low.
+ * See bfloat_mantiss() how the most significant bit of
+ * f->exponent is used to calculate bfloat mantissa value.
+ */
if (KEY_INODE(l) != KEY_INODE(r))
f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
else
@@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
}
EXPORT_SYMBOL(bch_bset_init_next);
+/*
+ * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
+ * accelerate bkey search in a btree node (pointed by bset_tree->data in
+ * memory). After search in the auxiliar tree by calling bset_search_tree(),
+ * a struct bset_search_iter is returned which indicates range [l, r] from
+ * bset_tree->data where the searching bkey might be inside. Then a followed
+ * linear comparison does the exact search, see __bch_bset_search() for how
+ * the auxiliary tree is used.
+ */
void bch_bset_build_written_tree(struct btree_keys *b)
{
struct bset_tree *t = bset_tree_last(b);
@@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
unsigned inorder, j, n = 1;
do {
+ /*
+ * A bit trick here.
+ * If p < t->size, (int)(p - t->size) is a minus value and
+ * the most significant bit is set, right shifting 31 bits
+ * gets 1. If p >= t->size, the most significant bit is
+ * not set, right shifting 31 bits gets 0.
+ * So the following 2 lines equals to
+ * if (p >= t->size)
+ * p = 0;
+ * but a branch instruction is avoided.
+ */
unsigned p = n << 4;
p &= ((int) (p - t->size)) >> 31;
@@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
f = &t->tree[j];
/*
+ * Similar bit trick, use subtract operation to avoid a branch
+ * instruction.
+ *
* n = (f->mantissa > bfloat_mantissa())
* ? j * 2
* : j * 2 + 1;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 547c9eedc2f4..c19f7716df88 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -90,6 +90,9 @@
#define MAX_NEED_GC 64
#define MAX_SAVE_PRIO 72
+#define MAX_GC_TIMES 100
+#define MIN_GC_NODES 100
+#define GC_SLEEP_MS 100
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
@@ -1008,6 +1011,13 @@ retry:
BUG_ON(b->level != level);
}
+ if (btree_node_io_error(b)) {
+ rw_unlock(write, b);
+ return ERR_PTR(-EIO);
+ }
+
+ BUG_ON(!b->written);
+
b->parent = parent;
b->accessed = 1;
@@ -1019,13 +1029,6 @@ retry:
for (; i <= b->keys.nsets; i++)
prefetch(b->keys.set[i].data);
- if (btree_node_io_error(b)) {
- rw_unlock(write, b);
- return ERR_PTR(-EIO);
- }
-
- BUG_ON(!b->written);
-
return b;
}
@@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b)
return ret;
}
+static size_t btree_gc_min_nodes(struct cache_set *c)
+{
+ size_t min_nodes;
+
+ /*
+ * Since incremental GC would stop 100ms when front
+ * side I/O comes, so when there are many btree nodes,
+ * if GC only processes constant (100) nodes each time,
+ * GC would last a long time, and the front side I/Os
+ * would run out of the buckets (since no new bucket
+ * can be allocated during GC), and be blocked again.
+ * So GC should not process constant nodes, but varied
+ * nodes according to the number of btree nodes, which
+ * realized by dividing GC into constant(100) times,
+ * so when there are many btree nodes, GC can process
+ * more nodes each time, otherwise, GC will process less
+ * nodes each time (but no less than MIN_GC_NODES)
+ */
+ min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
+ if (min_nodes < MIN_GC_NODES)
+ min_nodes = MIN_GC_NODES;
+
+ return min_nodes;
+}
+
+
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
{
@@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;
+ if (atomic_read(&b->c->search_inflight) &&
+ gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+ gc->nodes_pre = gc->nodes;
+ ret = -EAGAIN;
+ break;
+ }
+
if (need_resched()) {
ret = -EAGAIN;
break;
@@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c)
closure_sync(&writes);
cond_resched();
- if (ret && ret != -EAGAIN)
+ if (ret == -EAGAIN)
+ schedule_timeout_interruptible(msecs_to_jiffies
+ (GC_SLEEP_MS));
+ else if (ret)
pr_warn("gc failed!");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
do {
k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad);
- if (k)
+ if (k) {
btree_node_prefetch(b, k);
+ /*
+ * initiallize c->gc_stats.nodes
+ * for incremental GC
+ */
+ b->c->gc_stats.nodes++;
+ }
if (p)
ret = btree(check_recurse, p, b, op);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d211e2c25b6b..68e9d926134d 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
static inline void set_btree_node_ ## flag(struct btree *b) \
-{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); }
enum btree_flags {
BTREE_NODE_io_error,
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 0e14969182c6..618253683d40 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -199,11 +199,16 @@ static const struct file_operations debug_ops = {
.release = single_release
};
-int __init closure_debug_init(void)
+void __init closure_debug_init(void)
{
- closure_debug = debugfs_create_file("closures",
- 0400, bcache_debug, NULL, &debug_ops);
- return IS_ERR_OR_NULL(closure_debug);
+ if (!IS_ERR_OR_NULL(bcache_debug))
+ /*
+ * it is unnecessary to check return value of
+ * debugfs_create_file(), we should not care
+ * about this.
+ */
+ closure_debug = debugfs_create_file(
+ "closures", 0400, bcache_debug, NULL, &debug_ops);
}
#endif
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 71427eb5fdae..7c2c5bc7c88b 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-int closure_debug_init(void);
+void closure_debug_init(void);
void closure_debug_create(struct closure *cl);
void closure_debug_destroy(struct closure *cl);
#else
-static inline int closure_debug_init(void) { return 0; }
+static inline void closure_debug_init(void) {}
static inline void closure_debug_create(struct closure *cl) {}
static inline void closure_debug_destroy(struct closure *cl) {}
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index d030ce3025a6..12034c07257b 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
struct bio_vec bv, cbv;
struct bvec_iter iter, citer = { 0 };
- check = bio_clone_kmalloc(bio, GFP_NOIO);
+ check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
if (!check)
return;
+ check->bi_disk = bio->bi_disk;
check->bi_opf = REQ_OP_READ;
+ check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
+ check->bi_iter.bi_size = bio->bi_iter.bi_size;
+ bch_bio_map(check, NULL);
if (bch_bio_alloc_pages(check, GFP_NOIO))
goto out_put;
@@ -248,11 +252,12 @@ void bch_debug_exit(void)
debugfs_remove_recursive(bcache_debug);
}
-int __init bch_debug_init(struct kobject *kobj)
+void __init bch_debug_init(struct kobject *kobj)
{
- if (!IS_ENABLED(CONFIG_DEBUG_FS))
- return 0;
-
+ /*
+ * it is unnecessary to check return value of
+ * debugfs_create_file(), we should not care
+ * about this.
+ */
bcache_debug = debugfs_create_dir("bcache", NULL);
- return IS_ERR_OR_NULL(bcache_debug);
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 18f1b5239620..10748c626a1d 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c)
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
free_fifo(&c->journal.pin);
+ free_heap(&c->flush_btree);
}
int bch_journal_alloc(struct cache_set *c)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..7dbe8b6316a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
/*
* The journalling code doesn't handle the case where the keys to insert
* is bigger than an empty write: If we just return -ENOMEM here,
- * bio_insert() and bio_invalidate() will insert the keys created so far
+ * bch_data_insert_keys() will insert the keys created so far
* and finish the rest when the keylist is empty.
*/
if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
@@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- generic_end_io_acct(s->d->disk->queue,
- bio_data_dir(s->orig_bio),
+ generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
@@ -702,6 +701,8 @@ static void search_free(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
+ atomic_dec(&s->d->c->search_inflight);
+
if (s->iop.bio)
bio_put(s->iop.bio);
@@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio,
closure_init(&s->cl, NULL);
do_bio_hook(s, bio, request_endio);
+ atomic_inc(&d->c->search_inflight);
s->orig_bio = bio;
s->cache_miss = NULL;
@@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io = ddip->bi_end_io;
bio->bi_private = ddip->bi_private;
- generic_end_io_acct(ddip->d->disk->queue,
- bio_data_dir(bio),
+ generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
&ddip->d->disk->part0, ddip->start_time);
if (bio->bi_status) {
@@ -1102,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
generic_make_request(bio);
}
+static void quit_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *this_dc)
+{
+ int i;
+ struct bcache_device *d;
+ struct cached_dev *dc;
+
+ /*
+ * mutex bch_register_lock may compete with other parallel requesters,
+ * or attach/detach operations on other backing device. Waiting to
+ * the mutex lock may increase I/O request latency for seconds or more.
+ * To avoid such situation, if mutext_trylock() failed, only writeback
+ * rate of current cached device is set to 1, and __update_write_back()
+ * will decide writeback rate of other cached devices (remember now
+ * c->idle_counter is 0 already).
+ */
+ if (mutex_trylock(&bch_register_lock)) {
+ for (i = 0; i < c->devices_max_used; i++) {
+ if (!c->devices[i])
+ continue;
+
+ if (UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+
+ d = c->devices[i];
+ dc = container_of(d, struct cached_dev, disk);
+ /*
+ * set writeback rate to default minimum value,
+ * then let update_writeback_rate() to decide the
+ * upcoming rate.
+ */
+ atomic_long_set(&dc->writeback_rate.rate, 1);
+ }
+ mutex_unlock(&bch_register_lock);
+ } else
+ atomic_long_set(&this_dc->writeback_rate.rate, 1);
+}
+
/* Cached devices - read & write stuff */
static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1119,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- atomic_set(&dc->backing_idle, 0);
- generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+ if (likely(d->c)) {
+ if (atomic_read(&d->c->idle_counter))
+ atomic_set(&d->c->idle_counter, 0);
+ /*
+ * If at_max_writeback_rate of cache set is true and new I/O
+ * comes, quit max writeback rate of all cached devices
+ * attached to this cache set, and set at_max_writeback_rate
+ * to false.
+ */
+ if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+ atomic_set(&d->c->at_max_writeback_rate, 0);
+ quit_max_writeback_rate(d->c, dc);
+ }
+ }
+
+ generic_start_io_acct(q,
+ bio_op(bio),
+ bio_sectors(bio),
+ &d->disk->part0);
bio_set_dev(bio, dc->bdev);
bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1229,7 +1285,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
struct search *s;
struct closure *cl;
struct bcache_device *d = bio->bi_disk->private_data;
- int rw = bio_data_dir(bio);
if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
bio->bi_status = BLK_STS_IOERR;
@@ -1237,7 +1292,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+ generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
s = search_alloc(bio, d);
cl = &s->cl;
@@ -1254,7 +1309,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
flash_dev_nodata,
bcache_wq);
return BLK_QC_T_NONE;
- } else if (rw) {
+ } else if (bio_data_dir(bio)) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
&KEY(d->id, bio->bi_iter.bi_sector, 0),
&KEY(d->id, bio_end_sector(bio), 0));
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fa4058e43202..55a37641aa95 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
goto err;
}
- sb->last_mount = get_seconds();
+ sb->last_mount = (u32)ktime_get_real_seconds();
err = NULL;
get_page(bh->b_page);
@@ -696,12 +696,14 @@ static void bcache_device_detach(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
+ atomic_dec(&d->c->attached_dev_nr);
+
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
struct uuid_entry *u = d->c->uuids + d->id;
SET_UUID_FLASH_ONLY(u, 0);
memcpy(u->uuid, invalid_uuid, 16);
- u->invalidated = cpu_to_le32(get_seconds());
+ u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
bch_uuid_write(d->c);
}
@@ -796,11 +798,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
return idx;
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
- !(d->disk = alloc_disk(BCACHE_MINORS))) {
- ida_simple_remove(&bcache_device_idx, idx);
- return -ENOMEM;
- }
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+ goto err;
+
+ d->disk = alloc_disk(BCACHE_MINORS);
+ if (!d->disk)
+ goto err;
set_capacity(d->disk, sectors);
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -834,6 +837,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
blk_queue_write_cache(q, true, true);
return 0;
+
+err:
+ ida_simple_remove(&bcache_device_idx, idx);
+ return -ENOMEM;
+
}
/* Cached device */
@@ -1027,7 +1035,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint8_t *set_uuid)
{
- uint32_t rtime = cpu_to_le32(get_seconds());
+ uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
struct uuid_entry *u;
struct cached_dev *exist_dc, *t;
@@ -1070,7 +1078,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
(BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
memcpy(u->uuid, invalid_uuid, 16);
- u->invalidated = cpu_to_le32(get_seconds());
+ u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
u = NULL;
}
@@ -1138,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
bch_cached_dev_run(dc);
bcache_device_link(&dc->disk, c, "bdev");
+ atomic_inc(&c->attached_dev_nr);
/* Allow the writeback thread to proceed */
up_write(&dc->writeback_lock);
@@ -1285,6 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
pr_info("registered backing device %s", dc->backing_dev_name);
list_add(&dc->list, &uncached_devices);
+ /* attach to a matched cache set if it exists */
list_for_each_entry(c, &bch_cache_sets, list)
bch_cached_dev_attach(dc, c, NULL);
@@ -1311,6 +1321,8 @@ static void flash_dev_free(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
+ atomic_long_sub(bcache_dev_sectors_dirty(d),
+ &d->c->flash_dev_dirty_sectors);
bcache_device_free(d);
mutex_unlock(&bch_register_lock);
kobject_put(&d->kobj);
@@ -1390,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
get_random_bytes(u->uuid, 16);
memset(u->label, 0, 32);
- u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+ u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
SET_UUID_FLASH_ONLY(u, 1);
u->sectors = size >> 9;
@@ -1687,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
+ atomic_set(&c->attached_dev_nr, 0);
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1894,7 +1907,7 @@ static void run_cache_set(struct cache_set *c)
goto err;
closure_sync(&cl);
- c->sb.last_mount = get_seconds();
+ c->sb.last_mount = (u32)ktime_get_real_seconds();
bcache_write_super(c);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
@@ -2163,8 +2176,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!try_module_get(THIS_MODULE))
return -EBUSY;
- if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
- !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+ path = kstrndup(buffer, size, GFP_KERNEL);
+ if (!path)
+ goto err;
+
+ sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
+ if (!sb)
goto err;
err = "failed to open device";
@@ -2324,13 +2341,21 @@ static int __init bcache_init(void)
return bcache_major;
}
- if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
- !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
- bch_request_init() ||
- bch_debug_init(bcache_kobj) || closure_debug_init() ||
+ bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+ if (!bcache_wq)
+ goto err;
+
+ bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
+ if (!bcache_kobj)
+ goto err;
+
+ if (bch_request_init() ||
sysfs_create_files(bcache_kobj, files))
goto err;
+ bch_debug_init(bcache_kobj);
+ closure_debug_init();
+
return 0;
err:
bcache_exit();
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 225b15aa0340..81d3520b0702 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -149,6 +149,7 @@ SHOW(__bch_cached_dev)
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+ int wb = dc->writeback_running;
#define var(stat) (dc->stat)
@@ -170,7 +171,8 @@ SHOW(__bch_cached_dev)
var_printf(writeback_running, "%i");
var_print(writeback_delay);
var_print(writeback_percent);
- sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
+ sysfs_hprint(writeback_rate,
+ wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
sysfs_printf(io_error_limit, "%i", dc->error_limit);
sysfs_printf(io_disable, "%i", dc->io_disable);
@@ -188,15 +190,22 @@ SHOW(__bch_cached_dev)
char change[20];
s64 next_io;
- bch_hprint(rate, dc->writeback_rate.rate << 9);
- bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
- bch_hprint(target, dc->writeback_rate_target << 9);
- bch_hprint(proportional,dc->writeback_rate_proportional << 9);
- bch_hprint(integral, dc->writeback_rate_integral_scaled << 9);
- bch_hprint(change, dc->writeback_rate_change << 9);
-
- next_io = div64_s64(dc->writeback_rate.next - local_clock(),
- NSEC_PER_MSEC);
+ /*
+ * Except for dirty and target, other values should
+ * be 0 if writeback is not running.
+ */
+ bch_hprint(rate,
+ wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
+ : 0);
+ bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
+ bch_hprint(target, dc->writeback_rate_target << 9);
+ bch_hprint(proportional,
+ wb ? dc->writeback_rate_proportional << 9 : 0);
+ bch_hprint(integral,
+ wb ? dc->writeback_rate_integral_scaled << 9 : 0);
+ bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0);
+ next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(),
+ NSEC_PER_MSEC) : 0;
return sprintf(buf,
"rate:\t\t%s/sec\n"
@@ -255,8 +264,19 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
- sysfs_strtoul_clamp(writeback_rate,
- dc->writeback_rate.rate, 1, INT_MAX);
+ if (attr == &sysfs_writeback_rate) {
+ ssize_t ret;
+ long int v = atomic_long_read(&dc->writeback_rate.rate);
+
+ ret = strtoul_safe_clamp(buf, v, 1, INT_MAX);
+
+ if (!ret) {
+ atomic_long_set(&dc->writeback_rate.rate, v);
+ ret = size;
+ }
+
+ return ret;
+ }
sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds,
@@ -338,8 +358,8 @@ STORE(__cached_dev)
if (!v)
return size;
}
-
- pr_err("Can't attach %s: cache set not found", buf);
+ if (v == -ENOENT)
+ pr_err("Can't attach %s: cache set not found", buf);
return v;
}
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index fc479b026d6d..b15256bcf0e7 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+ d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
/* Bound the time. Don't let us fall further than 2 seconds behind
* (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cced87f8eb27..f7b0133c9d2f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,7 +442,7 @@ struct bch_ratelimit {
* Rate at which we want to do work, in units per second
* The units here correspond to the units passed to bch_next_delay()
*/
- uint32_t rate;
+ atomic_long_t rate;
};
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ad45ebe1a74b..481d4cf38ac0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
* flash-only devices
*/
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
- bcache_flash_devs_sectors_dirty(c);
+ atomic_long_read(&c->flash_dev_dirty_sectors);
/*
* Unfortunately there is no control of global dirty data. If the
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
dc->writeback_rate_proportional = proportional_scaled;
dc->writeback_rate_integral_scaled = integral_scaled;
- dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
- dc->writeback_rate.rate = new_rate;
+ dc->writeback_rate_change = new_rate -
+ atomic_long_read(&dc->writeback_rate.rate);
+ atomic_long_set(&dc->writeback_rate.rate, new_rate);
dc->writeback_rate_target = target;
}
+static bool set_at_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *dc)
+{
+ /*
+ * Idle_counter is increased everytime when update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
+ * to each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting exact dirty cached
+ * devices number, c->attached_dev_nr is used to calculate the idle
+ * throushold. It might be bigger if not all cached device are in write-
+ * back mode, but it still works well with limited extra rounds of
+ * update_writeback_rate().
+ */
+ if (atomic_inc_return(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6)
+ return false;
+
+ if (atomic_read(&c->at_max_writeback_rate) != 1)
+ atomic_set(&c->at_max_writeback_rate, 1);
+
+ atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
+
+ /* keep writeback_rate_target as existing value */
+ dc->writeback_rate_proportional = 0;
+ dc->writeback_rate_integral_scaled = 0;
+ dc->writeback_rate_change = 0;
+
+ /*
+ * Check c->idle_counter and c->at_max_writeback_rate agagain in case
+ * new I/O arrives during before set_at_max_writeback_rate() returns.
+ * Then the writeback rate is set to 1, and its new value should be
+ * decided via __update_writeback_rate().
+ */
+ if ((atomic_read(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6) ||
+ !atomic_read(&c->at_max_writeback_rate))
+ return false;
+
+ return true;
+}
+
static void update_writeback_rate(struct work_struct *work)
{
struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
return;
}
- down_read(&dc->writeback_lock);
-
- if (atomic_read(&dc->has_dirty) &&
- dc->writeback_percent)
- __update_writeback_rate(dc);
+ if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+ /*
+ * If the whole cache set is idle, set_at_max_writeback_rate()
+ * will set writeback rate to a max number. Then it is
+ * unncessary to update writeback rate for an idle cache set
+ * in maximum writeback rate number(s).
+ */
+ if (!set_at_max_writeback_rate(c, dc)) {
+ down_read(&dc->writeback_lock);
+ __update_writeback_rate(dc);
+ up_read(&dc->writeback_lock);
+ }
+ }
- up_read(&dc->writeback_lock);
/*
* CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
delay = writeback_delay(dc, size);
- /* If the control system would wait for at least half a
- * second, and there's been no reqs hitting the backing disk
- * for awhile: use an alternate mode where we have at most
- * one contiguous set of writebacks in flight at a time. If
- * someone wants to do IO it will be quick, as it will only
- * have to contend with one operation in flight, and we'll
- * be round-tripping data to the backing disk as quickly as
- * it can accept it.
- */
- if (delay >= HZ / 2) {
- /* 3 means at least 1.5 seconds, up to 7.5 if we
- * have slowed way down.
- */
- if (atomic_inc_return(&dc->backing_idle) >= 3) {
- /* Wait for current I/Os to finish */
- closure_sync(&cl);
- /* And immediately launch a new set. */
- delay = 0;
- }
- }
-
while (!kthread_should_stop() &&
!test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
delay) {
@@ -476,6 +507,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
if (!d)
return;
+ if (UUID_FLASH_ONLY(&c->uuids[inode]))
+ atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
+
stripe = offset_to_stripe(d, offset);
stripe_offset = offset & (d->stripe_size - 1);
@@ -673,10 +707,14 @@ static int bch_writeback_thread(void *arg)
}
/* Init */
+#define INIT_KEYS_EACH_TIME 500000
+#define INIT_KEYS_SLEEP_MS 100
struct sectors_dirty_init {
struct btree_op op;
unsigned inode;
+ size_t count;
+ struct bkey start;
};
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -691,18 +729,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
KEY_START(k), KEY_SIZE(k));
+ op->count++;
+ if (atomic_read(&b->c->search_inflight) &&
+ !(op->count % INIT_KEYS_EACH_TIME)) {
+ bkey_copy_key(&op->start, k);
+ return -EAGAIN;
+ }
+
return MAP_CONTINUE;
}
void bch_sectors_dirty_init(struct bcache_device *d)
{
struct sectors_dirty_init op;
+ int ret;
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
-
- bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
- sectors_dirty_init_fn, 0);
+ op.count = 0;
+ op.start = KEY(op.inode, 0, 0);
+
+ do {
+ ret = bch_btree_map_keys(&op.op, d->c, &op.start,
+ sectors_dirty_init_fn, 0);
+ if (ret == -EAGAIN)
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
+ else if (ret < 0) {
+ pr_warn("sectors dirty init failed, ret=%d!", ret);
+ break;
+ }
+ } while (ret == -EAGAIN);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -715,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_running = true;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
- dc->writeback_rate.rate = 1024;
+ atomic_long_set(&dc->writeback_rate.rate, 1024);
dc->writeback_rate_minimum = 8;
dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 610fb01de629..3745d7004c47 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
-static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
-{
- uint64_t i, ret = 0;
-
- mutex_lock(&bch_register_lock);
-
- for (i = 0; i < c->devices_max_used; i++) {
- struct bcache_device *d = c->devices[i];
-
- if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
- continue;
- ret += bcache_dev_sectors_dirty(d);
- }
-
- mutex_unlock(&bch_register_lock);
-
- return ret;
-}
-
static inline unsigned offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index ab13fcec3fca..75df4c9d8b54 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -588,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout)
}
/* Return md raid10 algorithm for @name */
-static const int raid10_name_to_format(const char *name)
+static int raid10_name_to_format(const char *name)
{
if (!strcasecmp(name, "near"))
return ALGORITHM_RAID10_NEAR;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 938766794c2e..3d0e2c198f06 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -885,9 +885,7 @@ EXPORT_SYMBOL_GPL(dm_table_set_type);
static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return q && blk_queue_dax(q);
+ return bdev_dax_supported(dev->bdev, PAGE_SIZE);
}
static bool dm_table_supports_dax(struct dm_table *t)
@@ -1907,6 +1905,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (dm_table_supports_dax(t))
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
+
if (dm_table_supports_dax_write_cache(t))
dax_write_cache(t->md->dax_dev, true);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 36ef284ad086..72142021b5c9 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -776,7 +776,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
static int __commit_transaction(struct dm_pool_metadata *pmd)
{
int r;
- size_t metadata_len, data_len;
struct thin_disk_superblock *disk_super;
struct dm_block *sblock;
@@ -797,14 +796,6 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (r < 0)
return r;
- r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
- if (r < 0)
- return r;
-
- r = dm_sm_root_size(pmd->data_sm, &data_len);
- if (r < 0)
- return r;
-
r = save_sm_roots(pmd);
if (r < 0)
return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7945238df1c0..b900723bbd0f 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1386,6 +1386,8 @@ static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+static void requeue_bios(struct pool *pool);
+
static void check_for_space(struct pool *pool)
{
int r;
@@ -1398,8 +1400,10 @@ static void check_for_space(struct pool *pool)
if (r)
return;
- if (nr_free)
+ if (nr_free) {
set_pool_mode(pool, PM_WRITE);
+ requeue_bios(pool);
+ }
}
/*
@@ -1476,7 +1480,10 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
r = dm_pool_alloc_data_block(pool->pmd, result);
if (r) {
- metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
+ if (r == -ENOSPC)
+ set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
+ else
+ metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
return r;
}
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 5961c7794ef3..87107c995cb5 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -136,6 +136,7 @@ struct dm_writecache {
struct dm_target *ti;
struct dm_dev *dev;
struct dm_dev *ssd_dev;
+ sector_t start_sector;
void *memory_map;
uint64_t memory_map_size;
size_t metadata_sectors;
@@ -259,7 +260,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
if (da != p) {
long i;
wc->memory_map = NULL;
- pages = kvmalloc(p * sizeof(struct page *), GFP_KERNEL);
+ pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
r = -ENOMEM;
goto err2;
@@ -293,6 +294,10 @@ static int persistent_memory_claim(struct dm_writecache *wc)
}
dax_read_unlock(id);
+
+ wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
+ wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
+
return 0;
err3:
kvfree(pages);
@@ -311,7 +316,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
static void persistent_memory_release(struct dm_writecache *wc)
{
if (wc->memory_vmapped)
- vunmap(wc->memory_map);
+ vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}
static struct page *persistent_memory_page(void *addr)
@@ -359,7 +364,7 @@ static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
- return wc->metadata_sectors +
+ return wc->start_sector + wc->metadata_sectors +
((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}
@@ -471,6 +476,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
if (unlikely(region.sector + region.count > wc->metadata_sectors))
region.count = wc->metadata_sectors - region.sector;
+ region.sector += wc->start_sector;
atomic_inc(&endio.count);
req.bi_op = REQ_OP_WRITE;
req.bi_op_flags = REQ_SYNC;
@@ -859,7 +865,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc)
if (wc->entries)
return 0;
- wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks);
+ wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
if (!wc->entries)
return -ENOMEM;
for (b = 0; b < wc->n_blocks; b++) {
@@ -1481,9 +1487,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
wb->page_offset = PAGE_SIZE;
if (max_pages <= WB_LIST_INLINE ||
- unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
- GFP_NOIO | __GFP_NORETRY |
- __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+ unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
+ GFP_NOIO | __GFP_NORETRY |
+ __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
wb->wc_list = wb->wc_list_inline;
max_pages = WB_LIST_INLINE;
}
@@ -1946,14 +1952,6 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
- if (WC_MODE_PMEM(wc)) {
- r = persistent_memory_claim(wc);
- if (r) {
- ti->error = "Unable to map persistent memory for cache";
- goto bad;
- }
- }
-
/*
* Parse the cache block size
*/
@@ -1982,7 +1980,16 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
while (opt_params) {
string = dm_shift_arg(&as), opt_params--;
- if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
+ if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
+ unsigned long long start_sector;
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
+ goto invalid_optional;
+ wc->start_sector = start_sector;
+ if (wc->start_sector != start_sector ||
+ wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
+ goto invalid_optional;
+ } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
string = dm_shift_arg(&as), opt_params--;
if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
goto invalid_optional;
@@ -2039,12 +2046,20 @@ invalid_optional:
goto bad;
}
- if (!WC_MODE_PMEM(wc)) {
+ if (WC_MODE_PMEM(wc)) {
+ r = persistent_memory_claim(wc);
+ if (r) {
+ ti->error = "Unable to map persistent memory for cache";
+ goto bad;
+ }
+ } else {
struct dm_io_region region;
struct dm_io_request req;
size_t n_blocks, n_metadata_blocks;
uint64_t n_bitmap_bits;
+ wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
+
bio_list_init(&wc->flush_list);
wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
if (IS_ERR(wc->flush_thread)) {
@@ -2097,7 +2112,7 @@ invalid_optional:
}
region.bdev = wc->ssd_dev->bdev;
- region.sector = 0;
+ region.sector = wc->start_sector;
region.count = wc->metadata_sectors;
req.bi_op = REQ_OP_READ;
req.bi_op_flags = REQ_SYNC;
@@ -2265,7 +2280,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
static struct target_type writecache_target = {
.name = "writecache",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = writecache_ctr,
.dtr = writecache_dtr,
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 3c0e45f4dcf5..a44183ff4be0 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -787,7 +787,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/* Chunk BIO work */
mutex_init(&dmz->chunk_lock);
- INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL);
+ INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
0, dev->name);
if (!dmz->chunk_wq) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e65429a29c06..20f7e4ef5342 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io)
io->start_time = jiffies;
- generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);
+ generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
+ &dm_disk(md)->part0);
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
@@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io)
int pending;
int rw = bio_data_dir(bio);
- generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
+ generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
+ io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@ -1056,8 +1058,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (len < 1)
goto out;
nr_pages = min(len, nr_pages);
- if (ti->type->direct_access)
- ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
+ ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
out:
dm_put_live_table(md, srcu_idx);
@@ -1606,10 +1607,9 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
* the usage of io->orig_bio in dm_remap_zone_report()
* won't be affected by this reassignment.
*/
- struct bio *b = bio_clone_bioset(bio, GFP_NOIO,
- &md->queue->bio_split);
+ struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
+ GFP_NOIO, &md->queue->bio_split);
ci.io->orig_bio = b;
- bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9);
bio_chain(b, bio);
ret = generic_make_request(bio);
break;
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 021cbf9ef1bf..e8a74e92be30 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -304,15 +304,6 @@ static void recover_bitmaps(struct md_thread *thread)
while (cinfo->recovery_map) {
slot = fls64((u64)cinfo->recovery_map) - 1;
- /* Clear suspend_area associated with the bitmap */
- spin_lock_irq(&cinfo->suspend_lock);
- list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
- if (slot == s->slot) {
- list_del(&s->list);
- kfree(s);
- }
- spin_unlock_irq(&cinfo->suspend_lock);
-
snprintf(str, 64, "bitmap%04d", slot);
bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
@@ -331,14 +322,30 @@ static void recover_bitmaps(struct md_thread *thread)
pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
goto clear_bit;
}
+
+ /* Clear suspend_area associated with the bitmap */
+ spin_lock_irq(&cinfo->suspend_lock);
+ list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+ if (slot == s->slot) {
+ list_del(&s->list);
+ kfree(s);
+ }
+ spin_unlock_irq(&cinfo->suspend_lock);
+
if (hi > 0) {
if (lo < mddev->recovery_cp)
mddev->recovery_cp = lo;
/* wake up thread to continue resync in case resync
* is not finished */
if (mddev->recovery_cp != MaxSector) {
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
+ /*
+ * clear the REMOTE flag since we will launch
+ * resync thread in current node.
+ */
+ clear_bit(MD_RESYNCING_REMOTE,
+ &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
}
}
clear_bit:
@@ -457,6 +464,11 @@ static void process_suspend_info(struct mddev *mddev,
struct suspend_info *s;
if (!hi) {
+ /*
+ * clear the REMOTE flag since resync or recovery is finished
+ * in remote node.
+ */
+ clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
remove_suspend_info(mddev, slot);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -585,6 +597,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
revalidate_disk(mddev->gendisk);
break;
case RESYNCING:
+ set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
process_suspend_info(mddev, le32_to_cpu(msg->slot),
le64_to_cpu(msg->low),
le64_to_cpu(msg->high));
@@ -1265,8 +1278,18 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
static int resync_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
dlm_unlock_sync(cinfo->resync_lockres);
- return resync_info_update(mddev, 0, 0);
+
+ /*
+ * If resync thread is interrupted so we can't say resync is finished,
+ * another node will launch resync thread to continue.
+ */
+ if (test_bit(MD_CLOSING, &mddev->flags))
+ return 0;
+ else
+ return resync_info_update(mddev, 0, 0);
}
static int area_resyncing(struct mddev *mddev, int direction,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 29b0cd9ec951..724def2f9eaa 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -204,10 +204,6 @@ static int start_readonly;
*/
static bool create_on_open = true;
-/* bio_clone_mddev
- * like bio_clone_bioset, but with a local bio set
- */
-
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev)
{
@@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request);
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
+ const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
int cpu;
@@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
md_handle_request(mddev, bio);
cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+ part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
@@ -5547,7 +5544,8 @@ int md_run(struct mddev *mddev)
else
pr_warn("md: personality for level %s is not loaded!\n",
mddev->clevel);
- return -EINVAL;
+ err = -EINVAL;
+ goto abort;
}
spin_unlock(&pers_lock);
if (mddev->level != pers->level) {
@@ -5560,7 +5558,8 @@ int md_run(struct mddev *mddev)
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
module_put(pers->owner);
- return -EINVAL;
+ err = -EINVAL;
+ goto abort;
}
if (pers->sync_request) {
@@ -5629,7 +5628,7 @@ int md_run(struct mddev *mddev)
mddev->private = NULL;
module_put(pers->owner);
bitmap_destroy(mddev);
- return err;
+ goto abort;
}
if (mddev->queue) {
bool nonrot = true;
@@ -7678,6 +7677,23 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
resync -= atomic_read(&mddev->recovery_active);
if (resync == 0) {
+ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags) &&
+ rdev->recovery_offset != MaxSector &&
+ rdev->recovery_offset) {
+ seq_printf(seq, "\trecover=REMOTE");
+ return 1;
+ }
+ if (mddev->reshape_position != MaxSector)
+ seq_printf(seq, "\treshape=REMOTE");
+ else
+ seq_printf(seq, "\tresync=REMOTE");
+ return 1;
+ }
if (mddev->recovery_cp < MaxSector) {
seq_printf(seq, "\tresync=PENDING");
return 1;
@@ -8044,8 +8060,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
- curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
- (int)part_stat_read(&disk->part0, sectors[1]) -
+ curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2d148bdaba74..8afd6bfdbfb9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -496,6 +496,7 @@ enum recovery_flags {
MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */
MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */
+ MD_RESYNCING_REMOTE, /* remote node is running resync thread */
};
static inline int __must_check mddev_lock(struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 478cf446827f..35bd3a62451b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3893,6 +3893,13 @@ static int raid10_run(struct mddev *mddev)
disk->rdev->saved_raid_disk < 0)
conf->fullsync = 1;
}
+
+ if (disk->replacement &&
+ !test_bit(In_sync, &disk->replacement->flags) &&
+ disk->replacement->saved_raid_disk < 0) {
+ conf->fullsync = 1;
+ }
+
disk->recovery_disabled = mddev->recovery_disabled - 1;
}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2b775abf377b..7416db70c6cc 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -717,7 +717,6 @@ static void r5c_disable_writeback_async(struct work_struct *work)
static void r5l_submit_current_io(struct r5l_log *log)
{
struct r5l_io_unit *io = log->current_io;
- struct bio *bio;
struct r5l_meta_block *block;
unsigned long flags;
u32 crc;
@@ -730,7 +729,6 @@ static void r5l_submit_current_io(struct r5l_log *log)
block->meta_size = cpu_to_le32(io->meta_offset);
crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
block->checksum = cpu_to_le32(crc);
- bio = io->current_bio;
log->current_io = NULL;
spin_lock_irqsave(&log->io_list_lock, flags);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2031506a0ecd..81eaa221216c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -409,16 +409,14 @@ void raid5_release_stripe(struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread);
return;
slow_path:
- local_irq_save(flags);
/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
- if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
+ if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
INIT_LIST_HEAD(&list);
hash = sh->hash_lock_index;
do_release_stripe(conf, sh, &list);
- spin_unlock(&conf->device_lock);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
release_inactive_stripe_list(conf, &list, hash);
}
- local_irq_restore(flags);
}
static inline void remove_hash(struct stripe_head *sh)
@@ -4521,6 +4519,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->failed++;
if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1;
+ else if (!rdev) {
+ rdev = rcu_dereference(
+ conf->disks[i].replacement);
+ if (rdev && !test_bit(Faulty, &rdev->flags))
+ do_recovery = 1;
+ }
}
if (test_bit(R5_InJournal, &dev->flags))