28 files changed, 521 insertions, 200 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6bf294f3907..05f82ff6f016 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;
 
-	/*
-	 * Set to zero by things that touch the backing volume-- except
-	 * writeback.  Incremented by writeback.  Used to determine when to
-	 * accelerate idle writeback.
-	 */
-	atomic_t		backing_idle;
-
 	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
@@ -423,9 +416,9 @@ struct cache {
 	/*
 	 * When allocating new buckets, prio_write() gets first dibs - since we
 	 * may not be allocate at all without writing priorities and gens.
-	 * prio_buckets[] contains the last buckets we wrote priorities to (so
-	 * gc can mark them as metadata), prio_next[] contains the buckets
-	 * allocated for the next prio write.
+	 * prio_last_buckets[] contains the last buckets we wrote priorities to
+	 * (so gc can mark them as metadata), prio_buckets[] contains the
+	 * buckets allocated for the next prio write.
 	 */
 	uint64_t		*prio_buckets;
 	uint64_t		*prio_last_buckets;
@@ -474,6 +467,7 @@ struct cache {
 
 struct gc_stat {
 	size_t			nodes;
+	size_t			nodes_pre;
 	size_t			key_bytes;
 
 	size_t			nkeys;
@@ -514,6 +508,8 @@ struct cache_set {
 	struct cache_accounting accounting;
 
 	unsigned long		flags;
+	atomic_t		idle_counter;
+	atomic_t		at_max_writeback_rate;
 
 	struct cache_sb		sb;
 
@@ -523,8 +519,10 @@ struct cache_set {
 
 	struct bcache_device	**devices;
 	unsigned		devices_max_used;
+	atomic_t		attached_dev_nr;
 	struct list_head	cached_devs;
 	uint64_t		cached_dev_sectors;
+	atomic_long_t		flash_dev_dirty_sectors;
 	struct closure		caching;
 
 	struct closure		sb_write;
@@ -603,6 +601,10 @@ struct cache_set {
 	 */
 	atomic_t		rescale;
 	/*
+	 * used for GC, identify if any front side I/Os is inflight
+	 */
+	atomic_t		search_inflight;
+	/*
 	 * When we invalidate buckets, we use both the priority and the amount
 	 * of good data to determine which buckets to reuse first - to weight
 	 * those together consistently we keep track of the smallest nonzero
@@ -995,7 +997,7 @@ void bch_open_buckets_free(struct cache_set *);
 int bch_cache_allocator_start(struct cache *ca);
 
 void bch_debug_exit(void);
-int bch_debug_init(struct kobject *);
+void bch_debug_init(struct kobject *kobj);
 void bch_request_exit(void);
 int bch_request_init(void);
 
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f3403b45bc28..596c93b44e9b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init);
 
 /* Binary tree stuff for auxiliary search trees */
 
+/*
+ * return array index next to j when does in-order traverse
+ * of a binary tree which is stored in a linear array
+ */
 static unsigned inorder_next(unsigned j, unsigned size)
 {
 	if (j * 2 + 1 < size) {
@@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size)
 	return j;
 }
 
+/*
+ * return array index previous to j when does in-order traverse
+ * of a binary tree which is stored in a linear array
+ */
 static unsigned inorder_prev(unsigned j, unsigned size)
 {
 	if (j * 2 < size) {
@@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
 	return j;
 }
 
+/*
+ * Return the cacheline index in bset_tree->data, where j is index
+ * from a linear array which stores the auxiliar binary tree
+ */
 static unsigned to_inorder(unsigned j, struct bset_tree *t)
 {
 	return __to_inorder(j, t->size, t->extra);
@@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
 	return j;
 }
 
+/*
+ * Return an index from a linear array which stores the auxiliar binary
+ * tree, j is the cacheline index of t->data.
+ */
 static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
 {
 	return __inorder_to_tree(j, t->size, t->extra);
@@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
 	return low;
 }
 
+/*
+ * Calculate mantissa value for struct bkey_float.
+ * If most significant bit of f->exponent is not set, then
+ *  - f->exponent >> 6 is 0
+ *  - p[0] points to bkey->low
+ *  - p[-1] borrows bits from KEY_INODE() of bkey->high
+ * if most isgnificant bits of f->exponent is set, then
+ *  - f->exponent >> 6 is 1
+ *  - p[0] points to bits from KEY_INODE() of bkey->high
+ *  - p[-1] points to other bits from KEY_INODE() of
+ *    bkey->high too.
+ * See make_bfloat() to check when most significant bit of f->exponent
+ * is set or not.
+ */
 static inline unsigned bfloat_mantissa(const struct bkey *k,
 				       struct bkey_float *f)
 {
@@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
 	BUG_ON(m < l || m > r);
 	BUG_ON(bkey_next(p) != m);
 
+	/*
+	 * If l and r have different KEY_INODE values (different backing
+	 * device), f->exponent records how many least significant bits
+	 * are different in KEY_INODE values and sets most significant
+	 * bits to 1 (by +64).
+	 * If l and r have same KEY_INODE value, f->exponent records
+	 * how many different bits in least significant bits of bkey->low.
+	 * See bfloat_mantiss() how the most significant bit of
+	 * f->exponent is used to calculate bfloat mantissa value.
+	 */
 	if (KEY_INODE(l) != KEY_INODE(r))
 		f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
 	else
@@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
 }
 EXPORT_SYMBOL(bch_bset_init_next);
 
+/*
+ * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
+ * accelerate bkey search in a btree node (pointed by bset_tree->data in
+ * memory). After search in the auxiliar tree by calling bset_search_tree(),
+ * a struct bset_search_iter is returned which indicates range [l, r] from
+ * bset_tree->data where the searching bkey might be inside. Then a followed
+ * linear comparison does the exact search, see __bch_bset_search() for how
+ * the auxiliary tree is used.
+ */
 void bch_bset_build_written_tree(struct btree_keys *b)
 {
 	struct bset_tree *t = bset_tree_last(b);
@@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
 	unsigned inorder, j, n = 1;
 
 	do {
+		/*
+		 * A bit trick here.
+		 * If p < t->size, (int)(p - t->size) is a minus value and
+		 * the most significant bit is set, right shifting 31 bits
+		 * gets 1. If p >= t->size, the most significant bit is
+		 * not set, right shifting 31 bits gets 0.
+		 * So the following 2 lines equals to
+		 *	if (p >= t->size)
+		 *		p = 0;
+		 * but a branch instruction is avoided.
+		 */
 		unsigned p = n << 4;
 		p &= ((int) (p - t->size)) >> 31;
 
@@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
 		f = &t->tree[j];
 
 		/*
+		 * Similar bit trick, use subtract operation to avoid a branch
+		 * instruction.
+		 *
 		 * n = (f->mantissa > bfloat_mantissa())
 		 *	? j * 2
 		 *	: j * 2 + 1;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 547c9eedc2f4..c19f7716df88 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -90,6 +90,9 @@
 
 #define MAX_NEED_GC		64
 #define MAX_SAVE_PRIO		72
+#define MAX_GC_TIMES		100
+#define MIN_GC_NODES		100
+#define GC_SLEEP_MS		100
 
 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
 
@@ -1008,6 +1011,13 @@ retry:
 		BUG_ON(b->level != level);
 	}
 
+	if (btree_node_io_error(b)) {
+		rw_unlock(write, b);
+		return ERR_PTR(-EIO);
+	}
+
+	BUG_ON(!b->written);
+
 	b->parent = parent;
 	b->accessed = 1;
 
@@ -1019,13 +1029,6 @@ retry:
 	for (; i <= b->keys.nsets; i++)
 		prefetch(b->keys.set[i].data);
 
-	if (btree_node_io_error(b)) {
-		rw_unlock(write, b);
-		return ERR_PTR(-EIO);
-	}
-
-	BUG_ON(!b->written);
-
 	return b;
 }
 
@@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b)
 	return ret;
 }
 
+static size_t btree_gc_min_nodes(struct cache_set *c)
+{
+	size_t min_nodes;
+
+	/*
+	 * Since incremental GC would stop 100ms when front
+	 * side I/O comes, so when there are many btree nodes,
+	 * if GC only processes constant (100) nodes each time,
+	 * GC would last a long time, and the front side I/Os
+	 * would run out of the buckets (since no new bucket
+	 * can be allocated during GC), and be blocked again.
+	 * So GC should not process constant nodes, but varied
+	 * nodes according to the number of btree nodes, which
+	 * realized by dividing GC into constant(100) times,
+	 * so when there are many btree nodes, GC can process
+	 * more nodes each time, otherwise, GC will process less
+	 * nodes each time (but no less than MIN_GC_NODES)
+	 */
+	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
+	if (min_nodes < MIN_GC_NODES)
+		min_nodes = MIN_GC_NODES;
+
+	return min_nodes;
+}
+
+
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			    struct closure *writes, struct gc_stat *gc)
 {
@@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
 		r->b = NULL;
 
+		if (atomic_read(&b->c->search_inflight) &&
+		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+			gc->nodes_pre =  gc->nodes;
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (need_resched()) {
 			ret = -EAGAIN;
 			break;
@@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c)
 		closure_sync(&writes);
 		cond_resched();
 
-		if (ret && ret != -EAGAIN)
+		if (ret == -EAGAIN)
+			schedule_timeout_interruptible(msecs_to_jiffies
+						       (GC_SLEEP_MS));
+		else if (ret)
 			pr_warn("gc failed!");
 	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
@@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 		do {
 			k = bch_btree_iter_next_filter(&iter, &b->keys,
 						       bch_ptr_bad);
-			if (k)
+			if (k) {
 				btree_node_prefetch(b, k);
+				/*
+				 * initiallize c->gc_stats.nodes
+				 * for incremental GC
+				 */
+				b->c->gc_stats.nodes++;
+			}
 
 			if (p)
 				ret = btree(check_recurse, p, b, op);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d211e2c25b6b..68e9d926134d 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b)			\
 {	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
 									\
 static inline void set_btree_node_ ## flag(struct btree *b)		\
-{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\
+{	set_bit(BTREE_NODE_ ## flag, &b->flags); }
 
 enum btree_flags {
 	BTREE_NODE_io_error,
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 0e14969182c6..618253683d40 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -199,11 +199,16 @@ static const struct file_operations debug_ops = {
 	.release	= single_release
 };
 
-int __init closure_debug_init(void)
+void  __init closure_debug_init(void)
 {
-	closure_debug = debugfs_create_file("closures",
-				0400, bcache_debug, NULL, &debug_ops);
-	return IS_ERR_OR_NULL(closure_debug);
+	if (!IS_ERR_OR_NULL(bcache_debug))
+		/*
+		 * it is unnecessary to check return value of
+		 * debugfs_create_file(), we should not care
+		 * about this.
+		 */
+		closure_debug = debugfs_create_file(
+			"closures", 0400, bcache_debug, NULL, &debug_ops);
 }
 #endif
 
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 71427eb5fdae..7c2c5bc7c88b 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
 
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 
-int closure_debug_init(void);
+void closure_debug_init(void);
 void closure_debug_create(struct closure *cl);
 void closure_debug_destroy(struct closure *cl);
 
 #else
 
-static inline int closure_debug_init(void) { return 0; }
+static inline void closure_debug_init(void) {}
 static inline void closure_debug_create(struct closure *cl) {}
 static inline void closure_debug_destroy(struct closure *cl) {}
 
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index d030ce3025a6..12034c07257b 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 	struct bio_vec bv, cbv;
 	struct bvec_iter iter, citer = { 0 };
 
-	check = bio_clone_kmalloc(bio, GFP_NOIO);
+	check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
 	if (!check)
 		return;
+	check->bi_disk = bio->bi_disk;
 	check->bi_opf = REQ_OP_READ;
+	check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
+	check->bi_iter.bi_size = bio->bi_iter.bi_size;
 
+	bch_bio_map(check, NULL);
 	if (bch_bio_alloc_pages(check, GFP_NOIO))
 		goto out_put;
 
@@ -248,11 +252,12 @@ void bch_debug_exit(void)
 		debugfs_remove_recursive(bcache_debug);
 }
 
-int __init bch_debug_init(struct kobject *kobj)
+void __init bch_debug_init(struct kobject *kobj)
 {
-	if (!IS_ENABLED(CONFIG_DEBUG_FS))
-		return 0;
-
+	/*
+	 * it is unnecessary to check return value of
+	 * debugfs_create_file(), we should not care
+	 * about this.
+	 */
 	bcache_debug = debugfs_create_dir("bcache", NULL);
-	return IS_ERR_OR_NULL(bcache_debug);
 }
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 18f1b5239620..10748c626a1d 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c)
 	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
 	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
 	free_fifo(&c->journal.pin);
+	free_heap(&c->flush_btree);
 }
 
 int bch_journal_alloc(struct cache_set *c)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..7dbe8b6316a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
 	/*
 	 * The journalling code doesn't handle the case where the keys to insert
 	 * is bigger than an empty write: If we just return -ENOMEM here,
-	 * bio_insert() and bio_invalidate() will insert the keys created so far
+	 * bch_data_insert_keys() will insert the keys created so far
 	 * and finish the rest when the keylist is empty.
 	 */
 	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
@@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
 static void bio_complete(struct search *s)
 {
 	if (s->orig_bio) {
-		generic_end_io_acct(s->d->disk->queue,
-				    bio_data_dir(s->orig_bio),
+		generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
 				    &s->d->disk->part0, s->start_time);
 
 		trace_bcache_request_end(s->d, s->orig_bio);
@@ -702,6 +701,8 @@ static void search_free(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 
+	atomic_dec(&s->d->c->search_inflight);
+
 	if (s->iop.bio)
 		bio_put(s->iop.bio);
 
@@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio,
 
 	closure_init(&s->cl, NULL);
 	do_bio_hook(s, bio, request_endio);
+	atomic_inc(&d->c->search_inflight);
 
 	s->orig_bio		= bio;
 	s->cache_miss		= NULL;
@@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio)
 	bio->bi_end_io = ddip->bi_end_io;
 	bio->bi_private = ddip->bi_private;
 
-	generic_end_io_acct(ddip->d->disk->queue,
-			    bio_data_dir(bio),
+	generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
 			    &ddip->d->disk->part0, ddip->start_time);
 
 	if (bio->bi_status) {
@@ -1102,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
 		generic_make_request(bio);
 }
 
+static void quit_max_writeback_rate(struct cache_set *c,
+				    struct cached_dev *this_dc)
+{
+	int i;
+	struct bcache_device *d;
+	struct cached_dev *dc;
+
+	/*
+	 * mutex bch_register_lock may compete with other parallel requesters,
+	 * or attach/detach operations on other backing device. Waiting to
+	 * the mutex lock may increase I/O request latency for seconds or more.
+	 * To avoid such situation, if mutext_trylock() failed, only writeback
+	 * rate of current cached device is set to 1, and __update_write_back()
+	 * will decide writeback rate of other cached devices (remember now
+	 * c->idle_counter is 0 already).
+	 */
+	if (mutex_trylock(&bch_register_lock)) {
+		for (i = 0; i < c->devices_max_used; i++) {
+			if (!c->devices[i])
+				continue;
+
+			if (UUID_FLASH_ONLY(&c->uuids[i]))
+				continue;
+
+			d = c->devices[i];
+			dc = container_of(d, struct cached_dev, disk);
+			/*
+			 * set writeback rate to default minimum value,
+			 * then let update_writeback_rate() to decide the
+			 * upcoming rate.
+			 */
+			atomic_long_set(&dc->writeback_rate.rate, 1);
+		}
+		mutex_unlock(&bch_register_lock);
+	} else
+		atomic_long_set(&this_dc->writeback_rate.rate, 1);
+}
+
 /* Cached devices - read & write stuff */
 
 static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1119,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 		return BLK_QC_T_NONE;
 	}
 
-	atomic_set(&dc->backing_idle, 0);
-	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+	if (likely(d->c)) {
+		if (atomic_read(&d->c->idle_counter))
+			atomic_set(&d->c->idle_counter, 0);
+		/*
+		 * If at_max_writeback_rate of cache set is true and new I/O
+		 * comes, quit max writeback rate of all cached devices
+		 * attached to this cache set, and set at_max_writeback_rate
+		 * to false.
+		 */
+		if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+			atomic_set(&d->c->at_max_writeback_rate, 0);
+			quit_max_writeback_rate(d->c, dc);
+		}
+	}
+
+	generic_start_io_acct(q,
+			      bio_op(bio),
+			      bio_sectors(bio),
+			      &d->disk->part0);
 
 	bio_set_dev(bio, dc->bdev);
 	bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1229,7 +1285,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
 	struct search *s;
 	struct closure *cl;
 	struct bcache_device *d = bio->bi_disk->private_data;
-	int rw = bio_data_dir(bio);
 
 	if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
 		bio->bi_status = BLK_STS_IOERR;
@@ -1237,7 +1292,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
 		return BLK_QC_T_NONE;
 	}
 
-	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+	generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
 
 	s = search_alloc(bio, d);
 	cl = &s->cl;
@@ -1254,7 +1309,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
 				      flash_dev_nodata,
 				      bcache_wq);
 		return BLK_QC_T_NONE;
-	} else if (rw) {
+	} else if (bio_data_dir(bio)) {
 		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_iter.bi_sector, 0),
 					&KEY(d->id, bio_end_sector(bio), 0));
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fa4058e43202..55a37641aa95 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
 		goto err;
 	}
 
-	sb->last_mount = get_seconds();
+	sb->last_mount = (u32)ktime_get_real_seconds();
 	err = NULL;
 
 	get_page(bh->b_page);
@@ -696,12 +696,14 @@ static void bcache_device_detach(struct bcache_device *d)
 {
 	lockdep_assert_held(&bch_register_lock);
 
+	atomic_dec(&d->c->attached_dev_nr);
+
 	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
 		struct uuid_entry *u = d->c->uuids + d->id;
 
 		SET_UUID_FLASH_ONLY(u, 0);
 		memcpy(u->uuid, invalid_uuid, 16);
-		u->invalidated = cpu_to_le32(get_seconds());
+		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
 		bch_uuid_write(d->c);
 	}
 
@@ -796,11 +798,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 		return idx;
 
 	if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
-			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
-	    !(d->disk = alloc_disk(BCACHE_MINORS))) {
-		ida_simple_remove(&bcache_device_idx, idx);
-		return -ENOMEM;
-	}
+			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+		goto err;
+
+	d->disk = alloc_disk(BCACHE_MINORS);
+	if (!d->disk)
+		goto err;
 
 	set_capacity(d->disk, sectors);
 	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -834,6 +837,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	blk_queue_write_cache(q, true, true);
 
 	return 0;
+
+err:
+	ida_simple_remove(&bcache_device_idx, idx);
+	return -ENOMEM;
+
 }
 
 /* Cached device */
@@ -1027,7 +1035,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
 int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 			  uint8_t *set_uuid)
 {
-	uint32_t rtime = cpu_to_le32(get_seconds());
+	uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
 	struct uuid_entry *u;
 	struct cached_dev *exist_dc, *t;
 
@@ -1070,7 +1078,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
 	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
 		memcpy(u->uuid, invalid_uuid, 16);
-		u->invalidated = cpu_to_le32(get_seconds());
+		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
 		u = NULL;
 	}
 
@@ -1138,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 
 	bch_cached_dev_run(dc);
 	bcache_device_link(&dc->disk, c, "bdev");
+	atomic_inc(&c->attached_dev_nr);
 
 	/* Allow the writeback thread to proceed */
 	up_write(&dc->writeback_lock);
@@ -1285,6 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 	pr_info("registered backing device %s", dc->backing_dev_name);
 
 	list_add(&dc->list, &uncached_devices);
+	/* attach to a matched cache set if it exists */
 	list_for_each_entry(c, &bch_cache_sets, list)
 		bch_cached_dev_attach(dc, c, NULL);
 
@@ -1311,6 +1321,8 @@ static void flash_dev_free(struct closure *cl)
 {
 	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
 	mutex_lock(&bch_register_lock);
+	atomic_long_sub(bcache_dev_sectors_dirty(d),
+			&d->c->flash_dev_dirty_sectors);
 	bcache_device_free(d);
 	mutex_unlock(&bch_register_lock);
 	kobject_put(&d->kobj);
@@ -1390,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
 
 	get_random_bytes(u->uuid, 16);
 	memset(u->label, 0, 32);
-	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+	u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
 
 	SET_UUID_FLASH_ONLY(u, 1);
 	u->sectors = size >> 9;
@@ -1687,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	c->block_bits		= ilog2(sb->block_size);
 	c->nr_uuids		= bucket_bytes(c) / sizeof(struct uuid_entry);
 	c->devices_max_used	= 0;
+	atomic_set(&c->attached_dev_nr, 0);
 	c->btree_pages		= bucket_pages(c);
 	if (c->btree_pages > BTREE_MAX_PAGES)
 		c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1894,7 +1907,7 @@ static void run_cache_set(struct cache_set *c)
 		goto err;
 
 	closure_sync(&cl);
-	c->sb.last_mount = get_seconds();
+	c->sb.last_mount = (u32)ktime_get_real_seconds();
 	bcache_write_super(c);
 
 	list_for_each_entry_safe(dc, t, &uncached_devices, list)
@@ -2163,8 +2176,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (!try_module_get(THIS_MODULE))
 		return -EBUSY;
 
-	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
-	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+	path = kstrndup(buffer, size, GFP_KERNEL);
+	if (!path)
+		goto err;
+
+	sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
+	if (!sb)
 		goto err;
 
 	err = "failed to open device";
@@ -2324,13 +2341,21 @@ static int __init bcache_init(void)
 		return bcache_major;
 	}
 
-	if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
-	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
-	    bch_request_init() ||
-	    bch_debug_init(bcache_kobj) || closure_debug_init() ||
+	bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+	if (!bcache_wq)
+		goto err;
+
+	bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
+	if (!bcache_kobj)
+		goto err;
+
+	if (bch_request_init() ||
 	    sysfs_create_files(bcache_kobj, files))
 		goto err;
 
+	bch_debug_init(bcache_kobj);
+	closure_debug_init();
+
 	return 0;
 err:
 	bcache_exit();
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 225b15aa0340..81d3520b0702 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -149,6 +149,7 @@ SHOW(__bch_cached_dev)
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
 	const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+	int wb = dc->writeback_running;
 
 #define var(stat)		(dc->stat)
 
@@ -170,7 +171,8 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
-	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
+	sysfs_hprint(writeback_rate,
+		     wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
 	sysfs_hprint(io_errors,		atomic_read(&dc->io_errors));
 	sysfs_printf(io_error_limit,	"%i", dc->error_limit);
 	sysfs_printf(io_disable,	"%i", dc->io_disable);
@@ -188,15 +190,22 @@ SHOW(__bch_cached_dev)
 		char change[20];
 		s64 next_io;
 
-		bch_hprint(rate,	dc->writeback_rate.rate << 9);
-		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
-		bch_hprint(target,	dc->writeback_rate_target << 9);
-		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
-		bch_hprint(integral,	dc->writeback_rate_integral_scaled << 9);
-		bch_hprint(change,	dc->writeback_rate_change << 9);
-
-		next_io = div64_s64(dc->writeback_rate.next - local_clock(),
-				    NSEC_PER_MSEC);
+		/*
+		 * Except for dirty and target, other values should
+		 * be 0 if writeback is not running.
+		 */
+		bch_hprint(rate,
+			   wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
+			      : 0);
+		bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
+		bch_hprint(target, dc->writeback_rate_target << 9);
+		bch_hprint(proportional,
+			   wb ? dc->writeback_rate_proportional << 9 : 0);
+		bch_hprint(integral,
+			   wb ? dc->writeback_rate_integral_scaled << 9 : 0);
+		bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0);
+		next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(),
+					 NSEC_PER_MSEC) : 0;
 
 		return sprintf(buf,
 			       "rate:\t\t%s/sec\n"
@@ -255,8 +264,19 @@ STORE(__cached_dev)
 
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-	sysfs_strtoul_clamp(writeback_rate,
-			    dc->writeback_rate.rate, 1, INT_MAX);
+	if (attr == &sysfs_writeback_rate) {
+		ssize_t ret;
+		long int v = atomic_long_read(&dc->writeback_rate.rate);
+
+		ret = strtoul_safe_clamp(buf, v, 1, INT_MAX);
+
+		if (!ret) {
+			atomic_long_set(&dc->writeback_rate.rate, v);
+			ret = size;
+		}
+
+		return ret;
+	}
 
 	sysfs_strtoul_clamp(writeback_rate_update_seconds,
 			    dc->writeback_rate_update_seconds,
@@ -338,8 +358,8 @@ STORE(__cached_dev)
 			if (!v)
 				return size;
 		}
-
-		pr_err("Can't attach %s: cache set not found", buf);
+		if (v == -ENOENT)
+			pr_err("Can't attach %s: cache set not found", buf);
 		return v;
 	}
 
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index fc479b026d6d..b15256bcf0e7 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
-	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+	d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
 
 	/* Bound the time.  Don't let us fall further than 2 seconds behind
 	 * (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cced87f8eb27..f7b0133c9d2f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,7 +442,7 @@ struct bch_ratelimit {
 	 * Rate at which we want to do work, in units per second
 	 * The units here correspond to the units passed to bch_next_delay()
 	 */
-	uint32_t		rate;
+	atomic_long_t		rate;
 };
 
 static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ad45ebe1a74b..481d4cf38ac0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
 	 * flash-only devices
 	 */
 	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
-				bcache_flash_devs_sectors_dirty(c);
+				atomic_long_read(&c->flash_dev_dirty_sectors);
 
 	/*
 	 * Unfortunately there is no control of global dirty data.  If the
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
 	dc->writeback_rate_proportional = proportional_scaled;
 	dc->writeback_rate_integral_scaled = integral_scaled;
-	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
-	dc->writeback_rate.rate = new_rate;
+	dc->writeback_rate_change = new_rate -
+			atomic_long_read(&dc->writeback_rate.rate);
+	atomic_long_set(&dc->writeback_rate.rate, new_rate);
 	dc->writeback_rate_target = target;
 }
 
+static bool set_at_max_writeback_rate(struct cache_set *c,
+				       struct cached_dev *dc)
+{
+	/*
+	 * Idle_counter is increased everytime when update_writeback_rate() is
+	 * called. If all backing devices attached to the same cache set have
+	 * identical dc->writeback_rate_update_seconds values, it is about 6
+	 * rounds of update_writeback_rate() on each backing device before
+	 * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
+	 * to each dc->writeback_rate.rate.
+	 * In order to avoid extra locking cost for counting exact dirty cached
+	 * devices number, c->attached_dev_nr is used to calculate the idle
+	 * throushold. It might be bigger if not all cached device are in write-
+	 * back mode, but it still works well with limited extra rounds of
+	 * update_writeback_rate().
+	 */
+	if (atomic_inc_return(&c->idle_counter) <
+	    atomic_read(&c->attached_dev_nr) * 6)
+		return false;
+
+	if (atomic_read(&c->at_max_writeback_rate) != 1)
+		atomic_set(&c->at_max_writeback_rate, 1);
+
+	atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
+
+	/* keep writeback_rate_target as existing value */
+	dc->writeback_rate_proportional = 0;
+	dc->writeback_rate_integral_scaled = 0;
+	dc->writeback_rate_change = 0;
+
+	/*
+	 * Check c->idle_counter and c->at_max_writeback_rate agagain in case
+	 * new I/O arrives during before set_at_max_writeback_rate() returns.
+	 * Then the writeback rate is set to 1, and its new value should be
+	 * decided via __update_writeback_rate().
+	 */
+	if ((atomic_read(&c->idle_counter) <
+	     atomic_read(&c->attached_dev_nr) * 6) ||
+	    !atomic_read(&c->at_max_writeback_rate))
+		return false;
+
+	return true;
+}
+
 static void update_writeback_rate(struct work_struct *work)
 {
 	struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
 		return;
 	}
 
-	down_read(&dc->writeback_lock);
-
-	if (atomic_read(&dc->has_dirty) &&
-	    dc->writeback_percent)
-		__update_writeback_rate(dc);
+	if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+		/*
+		 * If the whole cache set is idle, set_at_max_writeback_rate()
+		 * will set writeback rate to a max number. Then it is
+		 * unncessary to update writeback rate for an idle cache set
+		 * in maximum writeback rate number(s).
+		 */
+		if (!set_at_max_writeback_rate(c, dc)) {
+			down_read(&dc->writeback_lock);
+			__update_writeback_rate(dc);
+			up_read(&dc->writeback_lock);
+		}
+	}
 
-	up_read(&dc->writeback_lock);
 
 	/*
 	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
 
 		delay = writeback_delay(dc, size);
 
-		/* If the control system would wait for at least half a
-		 * second, and there's been no reqs hitting the backing disk
-		 * for awhile: use an alternate mode where we have at most
-		 * one contiguous set of writebacks in flight at a time.  If
-		 * someone wants to do IO it will be quick, as it will only
-		 * have to contend with one operation in flight, and we'll
-		 * be round-tripping data to the backing disk as quickly as
-		 * it can accept it.
-		 */
-		if (delay >= HZ / 2) {
-			/* 3 means at least 1.5 seconds, up to 7.5 if we
-			 * have slowed way down.
-			 */
-			if (atomic_inc_return(&dc->backing_idle) >= 3) {
-				/* Wait for current I/Os to finish */
-				closure_sync(&cl);
-				/* And immediately launch a new set. */
-				delay = 0;
-			}
-		}
-
 		while (!kthread_should_stop() &&
 		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
 		       delay) {
@@ -476,6 +507,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 	if (!d)
 		return;
 
+	if (UUID_FLASH_ONLY(&c->uuids[inode]))
+		atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
+
 	stripe = offset_to_stripe(d, offset);
 	stripe_offset = offset & (d->stripe_size - 1);
 
@@ -673,10 +707,14 @@ static int bch_writeback_thread(void *arg)
 }
 
 /* Init */
+#define INIT_KEYS_EACH_TIME	500000
+#define INIT_KEYS_SLEEP_MS	100
 
 struct sectors_dirty_init {
 	struct btree_op	op;
 	unsigned	inode;
+	size_t		count;
+	struct bkey	start;
 };
 
 static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -691,18 +729,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
 		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
 					     KEY_START(k), KEY_SIZE(k));
 
+	op->count++;
+	if (atomic_read(&b->c->search_inflight) &&
+	    !(op->count % INIT_KEYS_EACH_TIME)) {
+		bkey_copy_key(&op->start, k);
+		return -EAGAIN;
+	}
+
 	return MAP_CONTINUE;
 }
 
 void bch_sectors_dirty_init(struct bcache_device *d)
 {
 	struct sectors_dirty_init op;
+	int ret;
 
 	bch_btree_op_init(&op.op, -1);
 	op.inode = d->id;
-
-	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
-			   sectors_dirty_init_fn, 0);
+	op.count = 0;
+	op.start = KEY(op.inode, 0, 0);
+
+	do {
+		ret = bch_btree_map_keys(&op.op, d->c, &op.start,
+					 sectors_dirty_init_fn, 0);
+		if (ret == -EAGAIN)
+			schedule_timeout_interruptible(
+				msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
+		else if (ret < 0) {
+			pr_warn("sectors dirty init failed, ret=%d!", ret);
+			break;
+		}
+	} while (ret == -EAGAIN);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -715,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_running		= true;
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
-	dc->writeback_rate.rate		= 1024;
+	atomic_long_set(&dc->writeback_rate.rate, 1024);
 	dc->writeback_rate_minimum	= 8;
 
 	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 610fb01de629..3745d7004c47 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
 	return ret;
 }
 
-static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
-{
-	uint64_t i, ret = 0;
-
-	mutex_lock(&bch_register_lock);
-
-	for (i = 0; i < c->devices_max_used; i++) {
-		struct bcache_device *d = c->devices[i];
-
-		if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
-			continue;
-		ret += bcache_dev_sectors_dirty(d);
-	}
-
-	mutex_unlock(&bch_register_lock);
-
-	return ret;
-}
-
 static inline unsigned offset_to_stripe(struct bcache_device *d,
 					uint64_t offset)
 {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index ab13fcec3fca..75df4c9d8b54 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -588,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout)
 }
 
 /* Return md raid10 algorithm for @name */
-static const int raid10_name_to_format(const char *name)
+static int raid10_name_to_format(const char *name)
 {
 	if (!strcasecmp(name, "near"))
 		return ALGORITHM_RAID10_NEAR;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 938766794c2e..3d0e2c198f06 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -885,9 +885,7 @@ EXPORT_SYMBOL_GPL(dm_table_set_type);
 static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
 			       sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return q && blk_queue_dax(q);
+	return bdev_dax_supported(dev->bdev, PAGE_SIZE);
 }
 
 static bool dm_table_supports_dax(struct dm_table *t)
@@ -1907,6 +1905,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 
 	if (dm_table_supports_dax(t))
 		blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
+
 	if (dm_table_supports_dax_write_cache(t))
 		dax_write_cache(t->md->dax_dev, true);
 
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 36ef284ad086..72142021b5c9 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -776,7 +776,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
 static int __commit_transaction(struct dm_pool_metadata *pmd)
 {
 	int r;
-	size_t metadata_len, data_len;
 	struct thin_disk_superblock *disk_super;
 	struct dm_block *sblock;
 
@@ -797,14 +796,6 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 	if (r < 0)
 		return r;
 
-	r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
-	if (r < 0)
-		return r;
-
-	r = dm_sm_root_size(pmd->data_sm, &data_len);
-	if (r < 0)
-		return r;
-
 	r = save_sm_roots(pmd);
 	if (r < 0)
 		return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7945238df1c0..b900723bbd0f 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1386,6 +1386,8 @@ static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
 
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
 
+static void requeue_bios(struct pool *pool);
+
 static void check_for_space(struct pool *pool)
 {
 	int r;
@@ -1398,8 +1400,10 @@ static void check_for_space(struct pool *pool)
 	if (r)
 		return;
 
-	if (nr_free)
+	if (nr_free) {
 		set_pool_mode(pool, PM_WRITE);
+		requeue_bios(pool);
+	}
 }
 
 /*
@@ -1476,7 +1480,10 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 
 	r = dm_pool_alloc_data_block(pool->pmd, result);
 	if (r) {
-		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
+		if (r == -ENOSPC)
+			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
+		else
+			metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
 		return r;
 	}
 
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 5961c7794ef3..87107c995cb5 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -136,6 +136,7 @@ struct dm_writecache {
 	struct dm_target *ti;
 	struct dm_dev *dev;
 	struct dm_dev *ssd_dev;
+	sector_t start_sector;
 	void *memory_map;
 	uint64_t memory_map_size;
 	size_t metadata_sectors;
@@ -259,7 +260,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 	if (da != p) {
 		long i;
 		wc->memory_map = NULL;
-		pages = kvmalloc(p * sizeof(struct page *), GFP_KERNEL);
+		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
 		if (!pages) {
 			r = -ENOMEM;
 			goto err2;
@@ -293,6 +294,10 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 	}
 
 	dax_read_unlock(id);
+
+	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
+	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
+
 	return 0;
 err3:
 	kvfree(pages);
@@ -311,7 +316,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 static void persistent_memory_release(struct dm_writecache *wc)
 {
 	if (wc->memory_vmapped)
-		vunmap(wc->memory_map);
+		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
 }
 
 static struct page *persistent_memory_page(void *addr)
@@ -359,7 +364,7 @@ static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
 
 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
 {
-	return wc->metadata_sectors +
+	return wc->start_sector + wc->metadata_sectors +
 		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
 }
 
@@ -471,6 +476,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
 		if (unlikely(region.sector + region.count > wc->metadata_sectors))
 			region.count = wc->metadata_sectors - region.sector;
 
+		region.sector += wc->start_sector;
 		atomic_inc(&endio.count);
 		req.bi_op = REQ_OP_WRITE;
 		req.bi_op_flags = REQ_SYNC;
@@ -859,7 +865,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc)
 
 	if (wc->entries)
 		return 0;
-	wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks);
+	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
 	if (!wc->entries)
 		return -ENOMEM;
 	for (b = 0; b < wc->n_blocks; b++) {
@@ -1481,9 +1487,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
 		wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
 		wb->page_offset = PAGE_SIZE;
 		if (max_pages <= WB_LIST_INLINE ||
-		    unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
-						     GFP_NOIO | __GFP_NORETRY |
-						     __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
+							   GFP_NOIO | __GFP_NORETRY |
+							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
 			wb->wc_list = wb->wc_list_inline;
 			max_pages = WB_LIST_INLINE;
 		}
@@ -1946,14 +1952,6 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
 
-	if (WC_MODE_PMEM(wc)) {
-		r = persistent_memory_claim(wc);
-		if (r) {
-			ti->error = "Unable to map persistent memory for cache";
-			goto bad;
-		}
-	}
-
 	/*
 	 * Parse the cache block size
 	 */
@@ -1982,7 +1980,16 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	while (opt_params) {
 		string = dm_shift_arg(&as), opt_params--;
-		if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
+		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
+			unsigned long long start_sector;
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
+				goto invalid_optional;
+			wc->start_sector = start_sector;
+			if (wc->start_sector != start_sector ||
+			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
+				goto invalid_optional;
+		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
 			string = dm_shift_arg(&as), opt_params--;
 			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
 				goto invalid_optional;
@@ -2039,12 +2046,20 @@ invalid_optional:
 		goto bad;
 	}
 
-	if (!WC_MODE_PMEM(wc)) {
+	if (WC_MODE_PMEM(wc)) {
+		r = persistent_memory_claim(wc);
+		if (r) {
+			ti->error = "Unable to map persistent memory for cache";
+			goto bad;
+		}
+	} else {
 		struct dm_io_region region;
 		struct dm_io_request req;
 		size_t n_blocks, n_metadata_blocks;
 		uint64_t n_bitmap_bits;
 
+		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
+
 		bio_list_init(&wc->flush_list);
 		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
 		if (IS_ERR(wc->flush_thread)) {
@@ -2097,7 +2112,7 @@ invalid_optional:
 		}
 
 		region.bdev = wc->ssd_dev->bdev;
-		region.sector = 0;
+		region.sector = wc->start_sector;
 		region.count = wc->metadata_sectors;
 		req.bi_op = REQ_OP_READ;
 		req.bi_op_flags = REQ_SYNC;
@@ -2265,7 +2280,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 
 static struct target_type writecache_target = {
 	.name			= "writecache",
-	.version		= {1, 0, 0},
+	.version		= {1, 1, 0},
 	.module			= THIS_MODULE,
 	.ctr			= writecache_ctr,
 	.dtr			= writecache_dtr,
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 3c0e45f4dcf5..a44183ff4be0 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -787,7 +787,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	/* Chunk BIO work */
 	mutex_init(&dmz->chunk_lock);
-	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL);
+	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
 	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
 					0, dev->name);
 	if (!dmz->chunk_wq) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e65429a29c06..20f7e4ef5342 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io)
 
 	io->start_time = jiffies;
 
-	generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);
+	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
+			      &dm_disk(md)->part0);
 
 	atomic_set(&dm_disk(md)->part0.in_flight[rw],
 		   atomic_inc_return(&md->pending[rw]));
@@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io)
 	int pending;
 	int rw = bio_data_dir(bio);
 
-	generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
+	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
+			    io->start_time);
 
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@ -1056,8 +1058,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (len < 1)
 		goto out;
 	nr_pages = min(len, nr_pages);
-	if (ti->type->direct_access)
-		ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
+	ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
 
  out:
 	dm_put_live_table(md, srcu_idx);
@@ -1606,10 +1607,9 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 				 * the usage of io->orig_bio in dm_remap_zone_report()
 				 * won't be affected by this reassignment.
 				 */
-				struct bio *b = bio_clone_bioset(bio, GFP_NOIO,
-								 &md->queue->bio_split);
+				struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
+							  GFP_NOIO, &md->queue->bio_split);
 				ci.io->orig_bio = b;
-				bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9);
 				bio_chain(b, bio);
 				ret = generic_make_request(bio);
 				break;
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 021cbf9ef1bf..e8a74e92be30 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -304,15 +304,6 @@ static void recover_bitmaps(struct md_thread *thread)
 	while (cinfo->recovery_map) {
 		slot = fls64((u64)cinfo->recovery_map) - 1;
 
-		/* Clear suspend_area associated with the bitmap */
-		spin_lock_irq(&cinfo->suspend_lock);
-		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
-			if (slot == s->slot) {
-				list_del(&s->list);
-				kfree(s);
-			}
-		spin_unlock_irq(&cinfo->suspend_lock);
-
 		snprintf(str, 64, "bitmap%04d", slot);
 		bm_lockres = lockres_init(mddev, str, NULL, 1);
 		if (!bm_lockres) {
@@ -331,14 +322,30 @@ static void recover_bitmaps(struct md_thread *thread)
 			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
 			goto clear_bit;
 		}
+
+		/* Clear suspend_area associated with the bitmap */
+		spin_lock_irq(&cinfo->suspend_lock);
+		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+			if (slot == s->slot) {
+				list_del(&s->list);
+				kfree(s);
+			}
+		spin_unlock_irq(&cinfo->suspend_lock);
+
 		if (hi > 0) {
 			if (lo < mddev->recovery_cp)
 				mddev->recovery_cp = lo;
 			/* wake up thread to continue resync in case resync
 			 * is not finished */
 			if (mddev->recovery_cp != MaxSector) {
-			    set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			    md_wakeup_thread(mddev->thread);
+				/*
+				 * clear the REMOTE flag since we will launch
+				 * resync thread in current node.
+				 */
+				clear_bit(MD_RESYNCING_REMOTE,
+					  &mddev->recovery);
+				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+				md_wakeup_thread(mddev->thread);
 			}
 		}
 clear_bit:
@@ -457,6 +464,11 @@ static void process_suspend_info(struct mddev *mddev,
 	struct suspend_info *s;
 
 	if (!hi) {
+		/*
+		 * clear the REMOTE flag since resync or recovery is finished
+		 * in remote node.
+		 */
+		clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
 		remove_suspend_info(mddev, slot);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
@@ -585,6 +597,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 		revalidate_disk(mddev->gendisk);
 		break;
 	case RESYNCING:
+		set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
 		process_suspend_info(mddev, le32_to_cpu(msg->slot),
 				     le64_to_cpu(msg->low),
 				     le64_to_cpu(msg->high));
@@ -1265,8 +1278,18 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 static int resync_finish(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
 	dlm_unlock_sync(cinfo->resync_lockres);
-	return resync_info_update(mddev, 0, 0);
+
+	/*
+	 * If resync thread is interrupted so we can't say resync is finished,
+	 * another node will launch resync thread to continue.
+	 */
+	if (test_bit(MD_CLOSING, &mddev->flags))
+		return 0;
+	else
+		return resync_info_update(mddev, 0, 0);
 }
 
 static int area_resyncing(struct mddev *mddev, int direction,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 29b0cd9ec951..724def2f9eaa 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -204,10 +204,6 @@ static int start_readonly;
  */
 static bool create_on_open = true;
 
-/* bio_clone_mddev
- * like bio_clone_bioset, but with a local bio set
- */
-
 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 			    struct mddev *mddev)
 {
@@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request);
 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
+	const int sgrp = op_stat_group(bio_op(bio));
 	struct mddev *mddev = q->queuedata;
 	unsigned int sectors;
 	int cpu;
@@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	md_handle_request(mddev, bio);
 
 	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
 	part_stat_unlock();
 
 	return BLK_QC_T_NONE;
@@ -5547,7 +5544,8 @@ int md_run(struct mddev *mddev)
 		else
 			pr_warn("md: personality for level %s is not loaded!\n",
 				mddev->clevel);
-		return -EINVAL;
+		err = -EINVAL;
+		goto abort;
 	}
 	spin_unlock(&pers_lock);
 	if (mddev->level != pers->level) {
@@ -5560,7 +5558,8 @@ int md_run(struct mddev *mddev)
 	    pers->start_reshape == NULL) {
 		/* This personality cannot handle reshaping... */
 		module_put(pers->owner);
-		return -EINVAL;
+		err = -EINVAL;
+		goto abort;
 	}
 
 	if (pers->sync_request) {
@@ -5629,7 +5628,7 @@ int md_run(struct mddev *mddev)
 		mddev->private = NULL;
 		module_put(pers->owner);
 		bitmap_destroy(mddev);
-		return err;
+		goto abort;
 	}
 	if (mddev->queue) {
 		bool nonrot = true;
@@ -7678,6 +7677,23 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
 		resync -= atomic_read(&mddev->recovery_active);
 
 	if (resync == 0) {
+		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
+			struct md_rdev *rdev;
+
+			rdev_for_each(rdev, mddev)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Faulty, &rdev->flags) &&
+				    rdev->recovery_offset != MaxSector &&
+				    rdev->recovery_offset) {
+					seq_printf(seq, "\trecover=REMOTE");
+					return 1;
+				}
+			if (mddev->reshape_position != MaxSector)
+				seq_printf(seq, "\treshape=REMOTE");
+			else
+				seq_printf(seq, "\tresync=REMOTE");
+			return 1;
+		}
 		if (mddev->recovery_cp < MaxSector) {
 			seq_printf(seq, "\tresync=PENDING");
 			return 1;
@@ -8044,8 +8060,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
-			      (int)part_stat_read(&disk->part0, sectors[1]) -
+		curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
 			      atomic_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2d148bdaba74..8afd6bfdbfb9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -496,6 +496,7 @@ enum recovery_flags {
 	MD_RECOVERY_FROZEN,	/* User request to abort, and not restart, any action */
 	MD_RECOVERY_ERROR,	/* sync-action interrupted because io-error */
 	MD_RECOVERY_WAIT,	/* waiting for pers->start() to finish */
+	MD_RESYNCING_REMOTE,	/* remote node is running resync thread */
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 478cf446827f..35bd3a62451b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3893,6 +3893,13 @@ static int raid10_run(struct mddev *mddev)
 			    disk->rdev->saved_raid_disk < 0)
 				conf->fullsync = 1;
 		}
+
+		if (disk->replacement &&
+		    !test_bit(In_sync, &disk->replacement->flags) &&
+		    disk->replacement->saved_raid_disk < 0) {
+			conf->fullsync = 1;
+		}
+
 		disk->recovery_disabled = mddev->recovery_disabled - 1;
 	}
 
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2b775abf377b..7416db70c6cc 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -717,7 +717,6 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 static void r5l_submit_current_io(struct r5l_log *log)
 {
 	struct r5l_io_unit *io = log->current_io;
-	struct bio *bio;
 	struct r5l_meta_block *block;
 	unsigned long flags;
 	u32 crc;
@@ -730,7 +729,6 @@ static void r5l_submit_current_io(struct r5l_log *log)
 	block->meta_size = cpu_to_le32(io->meta_offset);
 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 	block->checksum = cpu_to_le32(crc);
-	bio = io->current_bio;
 
 	log->current_io = NULL;
 	spin_lock_irqsave(&log->io_list_lock, flags);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2031506a0ecd..81eaa221216c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -409,16 +409,14 @@ void raid5_release_stripe(struct stripe_head *sh)
 		md_wakeup_thread(conf->mddev->thread);
 	return;
 slow_path:
-	local_irq_save(flags);
 	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
-	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
+	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
 		INIT_LIST_HEAD(&list);
 		hash = sh->hash_lock_index;
 		do_release_stripe(conf, sh, &list);
-		spin_unlock(&conf->device_lock);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		release_inactive_stripe_list(conf, &list, hash);
 	}
-	local_irq_restore(flags);
 }
 
 static inline void remove_hash(struct stripe_head *sh)
@@ -4521,6 +4519,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			s->failed++;
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
+			else if (!rdev) {
+				rdev = rcu_dereference(
+				    conf->disks[i].replacement);
+				if (rdev && !test_bit(Faulty, &rdev->flags))
+					do_recovery = 1;
+			}
 		}
 
 		if (test_bit(R5_InJournal, &dev->flags))