Diffstat (limited to 'fs')
-rw-r--r--  fs/bcachefs/btree_cache.c       26
-rw-r--r--  fs/bcachefs/btree_cache.h        1
-rw-r--r--  fs/bcachefs/btree_io.c           8
-rw-r--r--  fs/bcachefs/btree_node_scan.c   84
-rw-r--r--  fs/bcachefs/debug.c             11
-rw-r--r--  fs/bcachefs/errcode.h            1
-rw-r--r--  fs/bcachefs/error.c              6
-rw-r--r--  fs/bcachefs/extents.c           16
-rw-r--r--  fs/bcachefs/fsck.c              33
-rw-r--r--  fs/bcachefs/io_misc.c           27
-rw-r--r--  fs/bcachefs/io_misc.h            2
-rw-r--r--  fs/bcachefs/journal_reclaim.c    6
-rw-r--r--  fs/bcachefs/recovery.c          23
-rw-r--r--  fs/bcachefs/recovery_passes.c    2
-rw-r--r--  fs/erofs/data.c                 21
-rw-r--r--  fs/erofs/decompressor.c         12
-rw-r--r--  fs/erofs/dir.c                   6
-rw-r--r--  fs/erofs/fileio.c               14
-rw-r--r--  fs/erofs/internal.h              6
-rw-r--r--  fs/erofs/zdata.c                 8
-rw-r--r--  fs/erofs/zmap.c                  9
-rw-r--r--  fs/eventpoll.c                 470
-rw-r--r--  fs/smb/server/smb2pdu.c         29
-rw-r--r--  fs/smb/server/transport_rdma.c   5
-rw-r--r--  fs/smb/server/vfs.c              1
25 files changed, 521 insertions, 306 deletions
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 91e0aa796e6b..83c9860e6b82 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -85,7 +85,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+void __btree_node_data_free(struct btree *b)
{
BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
@@ -112,16 +112,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
b->aux_data = NULL;
-
- btree_node_to_freedlist(bc, b);
}
static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
BUG_ON(list_empty(&b->list));
list_del_init(&b->list);
+
+ __btree_node_data_free(b);
+
--bc->nr_freeable;
- __btree_node_data_free(bc, b);
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -185,10 +186,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
return NULL;
@@ -198,8 +196,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
}
bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
-
- __bch2_btree_node_to_freelist(bc, b);
return b;
}
@@ -524,7 +520,8 @@ restart:
--touched;
} else if (!btree_node_reclaim(c, b)) {
__bch2_btree_node_hash_remove(bc, b);
- __btree_node_data_free(bc, b);
+ __btree_node_data_free(b);
+ btree_node_to_freedlist(bc, b);
freed++;
bc->nr_freed++;
@@ -652,9 +649,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->nr_reserve; i++)
- if (!__bch2_btree_node_mem_alloc(c))
+ for (i = 0; i < bc->nr_reserve; i++) {
+ struct btree *b = __bch2_btree_node_mem_alloc(c);
+ if (!b)
goto err;
+ __bch2_btree_node_to_freelist(bc, b);
+ }
list_splice_init(&bc->live[0].list, &bc->freeable);
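
With __bch2_btree_node_mem_alloc() no longer putting freshly allocated nodes on the cache freelist, callers pair allocation and teardown themselves. A minimal sketch of that pairing, assuming only the helpers touched above (the wrapper name is illustrative; the pairing itself is what read_btree_nodes_worker() does later in this diff):

static void scratch_node_example(struct bch_fs *c)
{
	/* allocate a node that is no longer auto-added to the freelist */
	struct btree *b = __bch2_btree_node_mem_alloc(c);
	if (!b)
		return;

	/* ... use b->data / b->aux_data as scratch buffers ... */

	/* tear down the buffers and free the node ourselves */
	__btree_node_data_free(b);
	kfree(b);
}
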
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index ca3c1b145330..be275f87a60e 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
+void __btree_node_data_free(struct btree *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index e874a4357f64..a4cc72986e36 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -568,9 +568,9 @@ static int __btree_err(int ret,
bch2_mark_btree_validate_failure(failed, ca->dev_idx);
struct extent_ptr_decoded pick;
- have_retry = !bch2_bkey_pick_read_device(c,
+ have_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
- failed, &pick, -1);
+ failed, &pick, -1) == 1;
}
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
@@ -615,7 +615,6 @@ static int __btree_err(int ret,
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
@@ -644,7 +643,6 @@ static int __btree_err(int ret,
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
print:
@@ -1408,7 +1406,7 @@ static void btree_node_read_work(struct work_struct *work)
ret = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1);
- if (ret) {
+ if (ret <= 0) {
set_btree_node_read_error(b);
break;
}
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 23d8c62ea4b6..42c9eb2c786e 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -75,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k)
}
}
-static bool found_btree_node_is_readable(struct btree_trans *trans,
- struct found_btree_node *f)
-{
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, f);
-
- struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
- bool ret = !IS_ERR_OR_NULL(b);
- if (!ret)
- return ret;
-
- f->sectors_written = b->written;
- f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
- f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
-
- six_unlock_read(&b->c.lock);
-
- /*
- * We might update this node's range; if that happens, we need the node
- * to be re-read so the read path can trim keys that are no longer in
- * this node
- */
- if (b != btree_node_root(trans->c, b))
- bch2_btree_node_evict(trans, &tmp.k);
- return ret;
-}
-
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
const struct found_btree_node *l = _l;
@@ -159,17 +126,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = {
};
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
- struct bio *bio, struct btree_node *bn, u64 offset)
+ struct btree *b, struct bio *bio, u64 offset)
{
struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct btree_node *bn = b->data;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, bn, PAGE_SIZE);
+ bch2_bio_map(bio, b->data, c->opts.block_size);
u64 submit_time = local_clock();
submit_bio_wait(bio);
-
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
if (bio->bi_status) {
@@ -201,6 +168,14 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
return;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, b->data, c->opts.btree_node_size);
+
+ submit_time = local_clock();
+ submit_bio_wait(bio);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
@@ -217,7 +192,20 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
};
rcu_read_unlock();
- if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ found_btree_node_to_key(&b->key, &n);
+
+ CLASS(printbuf, buf)();
+ if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
+ /* read_done will swap out b->data for another buffer */
+ bn = b->data;
+ /*
+ * Grab journal_seq here because we want the max journal_seq of
+ * any bset; read_done sorts down to a single set and picks the
+ * max journal_seq
+ */
+ n.journal_seq = le64_to_cpu(bn->keys.journal_seq),
+ n.sectors_written = b->written;
+
mutex_lock(&f->lock);
if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
bch_err(c, "try_read_btree_node() can't handle endian conversion");
@@ -237,12 +225,20 @@ static int read_btree_nodes_worker(void *p)
struct find_btree_nodes_worker *w = p;
struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
struct bch_dev *ca = w->ca;
- void *buf = (void *) __get_free_page(GFP_KERNEL);
- struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
unsigned long last_print = jiffies;
+ struct btree *b = NULL;
+ struct bio *bio = NULL;
+
+ b = __bch2_btree_node_mem_alloc(c);
+ if (!b) {
+ bch_err(c, "read_btree_nodes_worker: error allocating buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
- if (!buf || !bio) {
- bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
+ if (!bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio");
w->f->ret = -ENOMEM;
goto err;
}
@@ -266,11 +262,13 @@ static int read_btree_nodes_worker(void *p)
!bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
continue;
- try_read_btree_node(w->f, ca, bio, buf, sector);
+ try_read_btree_node(w->f, ca, b, bio, sector);
}
err:
+ if (b)
+ __btree_node_data_free(b);
+ kfree(b);
bio_put(bio);
- free_page((unsigned long) buf);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
closure_put(w->cl);
kfree(w);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 901f643ead83..07c2a0f73cc2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -153,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
c->verify_data = __bch2_btree_node_mem_alloc(c);
if (!c->verify_data)
goto out;
-
- list_del_init(&c->verify_data->list);
}
BUG_ON(b->nsets != 1);
@@ -586,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
i->ubuf = buf;
i->size = size;
i->ret = 0;
+
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
restart:
seqmutex_lock(&c->btree_trans_lock);
list_sort(&c->btree_trans_list, list_ptr_order_cmp);
@@ -599,6 +599,11 @@ restart:
if (!closure_get_not_zero(&trans->ref))
continue;
+ if (!trans->srcu_held) {
+ closure_put(&trans->ref);
+ continue;
+ }
+
u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
@@ -620,6 +625,8 @@ restart:
}
seqmutex_unlock(&c->btree_trans_lock);
unlocked:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 86a842f1e88e..acc3b7b67704 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -282,7 +282,6 @@
x(EIO, sb_not_downgraded) \
x(EIO, btree_node_write_all_failed) \
x(EIO, btree_node_read_error) \
- x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(EIO, bucket_ref_update) \
x(EIO, trigger_alloc) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index b2a6c041e165..267e73d9d7e6 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -103,7 +103,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
return bch_err_throw(c, btree_need_topology_repair);
} else {
return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
- bch_err_throw(c, btree_node_read_validate_error);
+ bch_err_throw(c, btree_need_topology_repair);
}
}
@@ -633,7 +633,9 @@ err:
* log_fsck_err()s: that would require us to track for every error type
* which recovery pass corrects it, to get the fsck exit status correct:
*/
- if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ /* nothing */
+ } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
set_bit(BCH_FS_errors_fixed, &c->flags);
} else {
set_bit(BCH_FS_errors_not_fixed, &c->flags);
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 036e4ad95987..83cbd77dcb9c 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -50,19 +50,17 @@ void bch2_io_failures_to_text(struct printbuf *out,
struct bch_io_failures *failed)
{
static const char * const error_types[] = {
- "io", "checksum", "ec reconstruct", NULL
+ "btree validate", "io", "checksum", "ec reconstruct", NULL
};
for (struct bch_dev_io_failures *f = failed->devs;
f < failed->devs + failed->nr;
f++) {
unsigned errflags =
- ((!!f->failed_io) << 0) |
- ((!!f->failed_csum_nr) << 1) |
- ((!!f->failed_ec) << 2);
-
- if (!errflags)
- continue;
+ ((!!f->failed_btree_validate) << 0) |
+ ((!!f->failed_io) << 1) |
+ ((!!f->failed_csum_nr) << 2) |
+ ((!!f->failed_ec) << 3);
bch2_printbuf_make_room(out, 1024);
out->atomic++;
@@ -77,7 +75,9 @@ void bch2_io_failures_to_text(struct printbuf *out,
prt_char(out, ' ');
- if (is_power_of_2(errflags)) {
+ if (!errflags) {
+ prt_str(out, "no error - confused");
+ } else if (is_power_of_2(errflags)) {
prt_bitflags(out, error_types, errflags);
prt_str(out, " error");
} else {
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index dbf161e4311a..856eb2b41896 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,6 +12,7 @@
#include "fs.h"
#include "fsck.h"
#include "inode.h"
+#include "io_misc.h"
#include "keylist.h"
#include "namei.h"
#include "recovery_passes.h"
@@ -1919,33 +1920,11 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
"extent type past end of inode %llu:%u, i_size %llu\n%s",
i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));
- ret = PTR_ERR_OR_ZERO(whiteout);
- if (ret)
- goto err;
-
- bkey_init(&whiteout->k);
- whiteout->k.p = SPOS(k.k->p.inode,
- last_block,
- i->inode.bi_snapshot);
- bch2_key_resize(&whiteout->k,
- min(KEY_SIZE_MAX & (~0 << c->block_bits),
- U64_MAX - whiteout->k.p.offset));
-
-
- /*
- * Need a normal (not BTREE_ITER_all_snapshots)
- * iterator, if we're deleting in a different
- * snapshot and need to emit a whiteout
- */
- struct btree_iter iter2;
- bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents,
- bkey_start_pos(&whiteout->k),
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &iter2) ?:
- bch2_trans_update(trans, &iter2, whiteout,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter2);
+ ret = bch2_fpunch_snapshot(trans,
+ SPOS(i->inode.bi_inum,
+ last_block,
+ i->inode.bi_snapshot),
+ POS(i->inode.bi_inum, U64_MAX));
if (ret)
goto err;
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index bf72b1d2e2cb..07023667a475 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -135,6 +135,33 @@ err_noprint:
return ret;
}
+/* For fsck */
+int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
+{
+ u32 restart_count = trans->restart_count;
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bkey_i delete;
+
+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ start, end, 0, k,
+ &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end, &delete);
+
+ bch2_extent_trim_atomic(trans, &iter, &delete) ?:
+ bch2_trans_update(trans, &iter, &delete, 0);
+ }));
+
+ bch2_disk_reservation_put(c, &disk_res);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
/*
* Returns -BCH_ERR_transaction_restart if we had to drop locks:
*/
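
For reference, a hedged sketch of how the new helper is meant to be used from fsck-style code, mirroring the check_extent() hunk earlier in this diff (inum, snapshot and last_block are placeholders, and the wrapper itself is illustrative):

static int punch_past_eof_sketch(struct btree_trans *trans,
				 u64 inum, u32 snapshot, u64 last_block)
{
	/* drop every extent from last_block to the end of this inode/snapshot */
	return bch2_fpunch_snapshot(trans,
				    SPOS(inum, last_block, snapshot),
				    POS(inum, U64_MAX));
}
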
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index 9cb44a7c43c1..b93e4d4b3c0c 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -5,6 +5,8 @@
int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
u64, struct bch_io_opts, s64 *,
struct write_point_specifier);
+
+int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index cd6201741c59..0042d43b8e57 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -170,6 +170,12 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
return (struct journal_space) { 0, 0 };
/*
+ * It's possible for bucket size to be misaligned w.r.t. the filesystem
+ * block size:
+ */
+ min_bucket_size = round_down(min_bucket_size, block_sectors(c));
+
+ /*
* We sorted largest to smallest, and we want the smallest out of the
* @nr_devs_want largest devices:
*/
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index d0b7e3a36a54..c94debb12d2f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -273,24 +273,35 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
goto out;
struct btree_path *path = btree_iter_path(trans, &iter);
- if (unlikely(!btree_path_node(path, k->level) &&
- !k->allocated)) {
+ if (unlikely(!btree_path_node(path, k->level))) {
struct bch_fs *c = trans->c;
+ CLASS(printbuf, buf)();
+ prt_str(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, k->btree_id);
+ prt_printf(&buf, " level=%u ", k->level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+
if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
- bch_err(c, "have key in journal replay for btree depth that does not exist, confused");
+ bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
+ buf.buf);
ret = -EINVAL;
}
-#if 0
+
+ if (!k->allocated) {
+ bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
+ buf.buf);
+ k->overwritten = true;
+ goto out;
+ }
+
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
-#endif
- k->overwritten = true;
goto out;
}
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index c09ed2dd4639..6a039e011064 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -360,7 +360,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
!(r->passes_complete & BIT_ULL(pass));
bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
- if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) {
+ if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
__set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
}
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 6a329c329f43..16e4a6bd9b97 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -214,9 +214,11 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
/*
* bit 30: I/O error occurred on this folio
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
* bit 0 - 28: remaining parts to complete this folio
*/
-#define EROFS_ONLINEFOLIO_EIO (1 << 30)
+#define EROFS_ONLINEFOLIO_EIO 30
+#define EROFS_ONLINEFOLIO_DIRTY 29
void erofs_onlinefolio_init(struct folio *folio)
{
@@ -233,19 +235,23 @@ void erofs_onlinefolio_split(struct folio *folio)
atomic_inc((atomic_t *)&folio->private);
}
-void erofs_onlinefolio_end(struct folio *folio, int err)
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
{
int orig, v;
do {
orig = atomic_read((atomic_t *)&folio->private);
- v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0);
+ DBG_BUGON(orig <= 0);
+ v = dirty << EROFS_ONLINEFOLIO_DIRTY;
+ v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
} while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
- if (v & ~EROFS_ONLINEFOLIO_EIO)
+ if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
return;
folio->private = 0;
- folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO));
+ if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
+ flush_dcache_folio(folio);
+ folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
}
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -351,11 +357,16 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
+ trace_erofs_read_folio(folio, true);
+
return iomap_read_folio(folio, &erofs_iomap_ops);
}
static void erofs_readahead(struct readahead_control *rac)
{
+ trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
+ readahead_count(rac), true);
+
return iomap_readahead(rac, &erofs_iomap_ops);
}
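
A sketch (not part of the patch) of the folio->private word that erofs_onlinefolio_end() now maintains: bits 0-28 count the remaining parts, bit 29 records that the CPU wrote data that may alias in the D-cache, and bit 30 records an I/O error. The helper below is illustrative only:

static bool onlinefolio_word_complete(unsigned int v, bool *dirty, bool *eio)
{
	*dirty = v & (1U << 29);	/* EROFS_ONLINEFOLIO_DIRTY */
	*eio   = v & (1U << 30);	/* EROFS_ONLINEFOLIO_EIO */
	/* the folio is complete once the part count in bits 0-28 reaches zero */
	return !(v & ((1U << 29) - 1));
}

When this returns true, erofs_onlinefolio_end() flushes the D-cache if the dirty bit is set and then calls folio_end_read(folio, !eio).
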
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index bf62e2836b60..358061d7b660 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -301,13 +301,11 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
cur = min(cur, rq->outputsize);
if (cur && rq->out[0]) {
kin = kmap_local_page(rq->in[nrpages_in - 1]);
- if (rq->out[0] == rq->in[nrpages_in - 1]) {
+ if (rq->out[0] == rq->in[nrpages_in - 1])
memmove(kin + rq->pageofs_out, kin + pi, cur);
- flush_dcache_page(rq->out[0]);
- } else {
+ else
memcpy_to_page(rq->out[0], rq->pageofs_out,
kin + pi, cur);
- }
kunmap_local(kin);
}
rq->outputsize -= cur;
@@ -325,14 +323,12 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK;
DBG_BUGON(no >= nrpages_out);
cnt = min(insz - pi, PAGE_SIZE - po);
- if (rq->out[no] == rq->in[ni]) {
+ if (rq->out[no] == rq->in[ni])
memmove(kin + po,
kin + rq->pageofs_in + pi, cnt);
- flush_dcache_page(rq->out[no]);
- } else if (rq->out[no]) {
+ else if (rq->out[no])
memcpy_to_page(rq->out[no], po,
kin + rq->pageofs_in + pi, cnt);
- }
pi += cnt;
} while (pi < insz);
kunmap_local(kin);
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 2fae209d0274..3e4b38bec0aa 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -58,6 +58,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
de = erofs_bread(&buf, dbstart, true);
if (IS_ERR(de)) {
erofs_err(sb, "failed to readdir of logical block %llu of nid %llu",
@@ -88,6 +93,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
break;
ctx->pos = dbstart + maxsize;
ofs = 0;
+ cond_resched();
}
erofs_put_metabuf(&buf);
if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) {
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index df5cc63f2c01..91781718199e 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -38,7 +38,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
} else {
bio_for_each_folio_all(fi, &rq->bio) {
DBG_BUGON(folio_test_uptodate(fi.folio));
- erofs_onlinefolio_end(fi.folio, ret);
+ erofs_onlinefolio_end(fi.folio, ret, false);
}
}
bio_uninit(&rq->bio);
@@ -96,8 +96,6 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
struct erofs_map_blocks *map = &io->map;
unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
loff_t pos = folio_pos(folio), ofs;
- struct iov_iter iter;
- struct bio_vec bv;
int err = 0;
erofs_onlinefolio_init(folio);
@@ -122,13 +120,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
err = PTR_ERR(src);
break;
}
- bvec_set_folio(&bv, folio, len, cur);
- iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len);
- if (copy_to_iter(src, len, &iter) != len) {
- erofs_put_metabuf(&buf);
- err = -EIO;
- break;
- }
+ memcpy_to_folio(folio, cur, src, len);
erofs_put_metabuf(&buf);
} else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, cur + len);
@@ -162,7 +154,7 @@ io_retry:
}
cur += len;
}
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index a32c03a80c70..06b867d2fc3b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -315,10 +315,12 @@ static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED 0x0008
/* Located in the special packed inode */
-#define EROFS_MAP_FRAGMENT 0x0010
+#define __EROFS_MAP_FRAGMENT 0x0010
/* The extent refers to partial decompressed data */
#define EROFS_MAP_PARTIAL_REF 0x0020
+#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT)
+
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -390,7 +392,7 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
void erofs_onlinefolio_init(struct folio *folio);
void erofs_onlinefolio_split(struct folio *folio);
-void erofs_onlinefolio_end(struct folio *folio, int err);
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fe8071844724..e3f28a1bb945 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1034,7 +1034,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, end);
tight = false;
- } else if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ } else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
erofs_off_t fpos = offset + cur - map->m_la;
err = z_erofs_read_fragment(inode->i_sb, folio, cur,
@@ -1091,7 +1091,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
tight = (bs == PAGE_SIZE);
}
} while ((end = cur) > 0);
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
@@ -1196,7 +1196,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
cur += len;
}
kunmap_local(dst);
- erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
list_del(p);
kfree(bvi);
}
@@ -1355,7 +1355,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- erofs_onlinefolio_end(page_folio(page), err);
+ erofs_onlinefolio_end(page_folio(page), err, true);
continue;
}
if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 0bebc6e3a4d7..f1a15ff22147 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -413,8 +413,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
!vi->z_tailextent_headlcn) {
map->m_la = 0;
map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED |
- EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
return 0;
}
initial_lcn = ofs >> lclusterbits;
@@ -489,7 +488,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
goto unmap_out;
}
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
- map->m_flags |= EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
} else {
map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
@@ -617,7 +616,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
if (lstart < lend) {
map->m_la = lstart;
if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
- map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
vi->z_fragmentoff = map->m_plen;
if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
vi->z_fragmentoff |= map->m_pa << 32;
@@ -797,7 +796,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ?
IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a97a771a459c..0fbf5dfedb24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -137,7 +137,13 @@ struct epitem {
};
/* List header used to link this structure to the eventpoll ready list */
- struct llist_node rdllink;
+ struct list_head rdllink;
+
+ /*
+ * Works together "struct eventpoll"->ovflist in keeping the
+ * single linked chain of items.
+ */
+ struct epitem *next;
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
@@ -185,15 +191,22 @@ struct eventpoll {
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
- /*
- * List of ready file descriptors. Adding to this list is lockless. Items can be removed
- * only with eventpoll::mtx
- */
- struct llist_head rdllist;
+ /* List of ready file descriptors */
+ struct list_head rdllist;
+
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
+ /*
+ * This is a singly linked list that chains all the "struct epitem" whose
+ * events happened while transferring ready events to userspace w/out
+ * holding ->lock.
+ */
+ struct epitem *ovflist;
+
/* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
struct wakeup_source *ws;
@@ -348,14 +361,10 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}
-/*
- * Add the item to its container eventpoll's rdllist; do nothing if the item is already on rdllist.
- */
-static void epitem_ready(struct epitem *epi)
+/* Tells us if the item is currently linked */
+static inline int ep_is_linked(struct epitem *epi)
{
- if (&epi->rdllink == cmpxchg(&epi->rdllink.next, &epi->rdllink, NULL))
- llist_add(&epi->rdllink, &epi->ep->rdllist);
-
+ return !list_empty(&epi->rdllink);
}
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
@@ -374,26 +383,13 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
*
* @ep: Pointer to the eventpoll context.
*
- * Return: true if ready events might be available, false otherwise.
+ * Return: a value other than %zero if ready events are available,
+ * or %zero otherwise.
*/
-static inline bool ep_events_available(struct eventpoll *ep)
+static inline int ep_events_available(struct eventpoll *ep)
{
- bool available;
- int locked;
-
- locked = mutex_trylock(&ep->mtx);
- if (!locked) {
- /*
- * The lock held and someone might have removed all items while inspecting it. The
- * llist_empty() check in this case is futile. Assume that something is enqueued and
- * let ep_try_send_events() figure it out.
- */
- return true;
- }
-
- available = !llist_empty(&ep->rdllist);
- mutex_unlock(&ep->mtx);
- return available;
+ return !list_empty_careful(&ep->rdllist) ||
+ READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -728,6 +724,77 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
rcu_read_unlock();
}
+
+/*
+ * ep->mutex needs to be held because we could be hit by
+ * eventpoll_release_file() and epoll_ctl().
+ */
+static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
+{
+ /*
+ * Steal the ready list, and re-init the original one to the
+ * empty list. Also, set ep->ovflist to NULL so that events
+ * happening while looping w/out locks are not lost. We cannot
+ * have the poll callback queue directly on ep->rdllist,
+ * because we want the "sproc" callback to be able to do it
+ * in a lockless way.
+ */
+ lockdep_assert_irqs_enabled();
+ write_lock_irq(&ep->lock);
+ list_splice_init(&ep->rdllist, txlist);
+ WRITE_ONCE(ep->ovflist, NULL);
+ write_unlock_irq(&ep->lock);
+}
+
+static void ep_done_scan(struct eventpoll *ep,
+ struct list_head *txlist)
+{
+ struct epitem *epi, *nepi;
+
+ write_lock_irq(&ep->lock);
+ /*
+ * During the time we spent inside the "sproc" callback, some
+ * other events might have been queued by the poll callback.
+ * We re-insert them inside the main ready-list here.
+ */
+ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
+ nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+ /*
+ * We need to check if the item is already in the list.
+ * During the "sproc" callback execution time, items are
+ * queued into ->ovflist but the "txlist" might already
+ * contain them, and the list_splice() below takes care of them.
+ */
+ if (!ep_is_linked(epi)) {
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep in FIFO.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
+ }
+ }
+ /*
+ * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+ * releasing the lock, events will be queued in the normal way inside
+ * ep->rdllist.
+ */
+ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
+
+ /*
+ * Quickly re-inject items left on "txlist".
+ */
+ list_splice(txlist, &ep->rdllist);
+ __pm_relax(ep->ws);
+
+ if (!list_empty(&ep->rdllist)) {
+ if (waitqueue_active(&ep->wq))
+ wake_up(&ep->wq);
+ }
+
+ write_unlock_irq(&ep->lock);
+}
+
static void ep_get(struct eventpoll *ep)
{
refcount_inc(&ep->refcount);
@@ -765,12 +832,10 @@ static void ep_free(struct eventpoll *ep)
static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
{
struct file *file = epi->ffd.file;
- struct llist_node *put_back_last;
struct epitems_head *to_free;
struct hlist_head *head;
- LLIST_HEAD(put_back);
- lockdep_assert_held(&ep->mtx);
+ lockdep_assert_irqs_enabled();
/*
* Removes poll wait queue hooks.
@@ -802,20 +867,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- if (llist_on_list(&epi->rdllink)) {
- put_back_last = NULL;
- while (true) {
- struct llist_node *n = llist_del_first(&ep->rdllist);
-
- if (&epi->rdllink == n || WARN_ON(!n))
- break;
- if (!put_back_last)
- put_back_last = n;
- __llist_add(n, &put_back);
- }
- if (put_back_last)
- llist_add_batch(put_back.first, put_back_last, &ep->rdllist);
- }
+ write_lock_irq(&ep->lock);
+ if (ep_is_linked(epi))
+ list_del_init(&epi->rdllink);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -828,7 +883,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
- return ep_refcount_dec_and_test(ep);
+ return true;
}
/*
@@ -836,14 +891,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
*/
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
- WARN_ON_ONCE(__ep_remove(ep, epi, false));
+ if (__ep_remove(ep, epi, false))
+ WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
static void ep_clear_and_put(struct eventpoll *ep)
{
struct rb_node *rbp, *next;
struct epitem *epi;
- bool dispose;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
@@ -876,10 +931,8 @@ static void ep_clear_and_put(struct eventpoll *ep)
cond_resched();
}
- dispose = ep_refcount_dec_and_test(ep);
mutex_unlock(&ep->mtx);
-
- if (dispose)
+ if (ep_refcount_dec_and_test(ep))
ep_free(ep);
}
@@ -919,9 +972,8 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
struct eventpoll *ep = file->private_data;
- struct wakeup_source *ws;
- struct llist_node *n;
- struct epitem *epi;
+ LIST_HEAD(txlist);
+ struct epitem *epi, *tmp;
poll_table pt;
__poll_t res = 0;
@@ -935,39 +987,22 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
* the ready list.
*/
mutex_lock_nested(&ep->mtx, depth);
- while (true) {
- n = llist_del_first_init(&ep->rdllist);
- if (!n)
- break;
-
- epi = llist_entry(n, struct epitem, rdllink);
-
+ ep_start_scan(ep, &txlist);
+ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
if (ep_item_poll(epi, &pt, depth + 1)) {
res = EPOLLIN | EPOLLRDNORM;
- epitem_ready(epi);
break;
} else {
/*
- * We need to activate ep before deactivating epi, to prevent autosuspend
- * just in case epi becomes active after ep_item_poll() above.
- *
- * This is similar to ep_send_events().
+ * Item has been dropped into the ready list by the poll
+ * callback, but it's not actually ready as far as the
+ * caller-requested events go. We can remove it here.
*/
- ws = ep_wakeup_source(epi);
- if (ws) {
- if (ws->active)
- __pm_stay_awake(ep->ws);
- __pm_relax(ws);
- }
__pm_relax(ep_wakeup_source(epi));
-
- /* Just in case epi becomes active right before __pm_relax() */
- if (unlikely(ep_item_poll(epi, &pt, depth + 1)))
- ep_pm_stay_awake(epi);
-
- __pm_relax(ep->ws);
+ list_del_init(&epi->rdllink);
}
}
+ ep_done_scan(ep, &txlist);
mutex_unlock(&ep->mtx);
return res;
}
@@ -1100,7 +1135,7 @@ again:
dispose = __ep_remove(ep, epi, true);
mutex_unlock(&ep->mtx);
- if (dispose)
+ if (dispose && ep_refcount_dec_and_test(ep))
ep_free(ep);
goto again;
}
@@ -1116,10 +1151,12 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
- init_llist_head(&ep->rdllist);
+ INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT_CACHED;
+ ep->ovflist = EP_UNACTIVE_PTR;
ep->user = get_current_user();
refcount_set(&ep->refcount, 1);
@@ -1202,10 +1239,93 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
#endif /* CONFIG_KCMP */
/*
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list only in one
+ * direction i.e. either to the tail or to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Return: %false if element has been already added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that same element has been just
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (!try_cmpxchg(&new->next, &new, head))
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/*
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Return: %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Fast preliminary check */
+ if (epi->next != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
+/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*
+ * This callback takes a read lock in order not to contend with concurrent
+ * events from another file descriptor, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_start/done_scan(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
* Another thing worth mentioning is that ep_poll_callback() can be called
* concurrently for the same @epi from different CPUs if the poll table was
* initialized with several wait queue entries. Plural wakeup from different CPUs of a
@@ -1215,11 +1335,15 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
+ int pwake = 0;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
+ read_lock_irqsave(&ep->lock, flags);
+
ep_set_busy_poll_napi_id(epi);
/*
@@ -1229,7 +1353,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* until the next EPOLL_CTL_MOD will be issued.
*/
if (!(epi->event.events & ~EP_PRIVATE_BITS))
- goto out;
+ goto out_unlock;
/*
* Check the events coming with the callback. At this stage, not
@@ -1238,10 +1362,22 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* test for "key" != NULL before the event match test.
*/
if (pollflags && !(pollflags & epi->event.events))
- goto out;
+ goto out_unlock;
- ep_pm_stay_awake_rcu(epi);
- epitem_ready(epi);
+ /*
+ * If we are transferring events to userspace, we can hold no locks
+ * (because we're accessing user memory, and because of linux f_op->poll()
+ * semantics). All the events that happen during that period of time are
+ * chained in ep->ovflist and requeued later on.
+ */
+ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
+ if (chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
+ } else if (!ep_is_linked(epi)) {
+ /* In the usual case, add event to ready list. */
+ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
+ ep_pm_stay_awake_rcu(epi);
+ }
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
@@ -1270,9 +1406,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
+ pwake++;
+
+out_unlock:
+ read_unlock_irqrestore(&ep->lock, flags);
+
+ /* We have to call this outside the lock */
+ if (pwake)
ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
-out:
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1517,6 +1659,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (is_file_epoll(tfile))
tep = tfile->private_data;
+ lockdep_assert_irqs_enabled();
+
if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
max_user_watches) >= 0))
return -ENOSPC;
@@ -1528,10 +1672,11 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* Item initialization follow here ... */
- init_llist_node(&epi->rdllink);
+ INIT_LIST_HEAD(&epi->rdllink);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
+ epi->next = EP_UNACTIVE_PTR;
if (tep)
mutex_lock_nested(&tep->mtx, 1);
@@ -1598,13 +1743,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return -ENOMEM;
}
+ /* We have to drop the new item inside our item list to keep track of it */
+ write_lock_irq(&ep->lock);
+
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
- if (revents) {
+ if (revents && !ep_is_linked(epi)) {
+ list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
- epitem_ready(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1613,6 +1761,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
+ write_unlock_irq(&ep->lock);
+
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, NULL, 0);
@@ -1627,8 +1777,11 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
const struct epoll_event *event)
{
+ int pwake = 0;
poll_table pt;
+ lockdep_assert_irqs_enabled();
+
init_poll_funcptr(&pt, NULL);
/*
@@ -1672,16 +1825,24 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- ep_pm_stay_awake(epi);
- epitem_ready(epi);
+ write_lock_irq(&ep->lock);
+ if (!ep_is_linked(epi)) {
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
- /* Notify waiting tasks that events are available */
- if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
- if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(ep, NULL, 0);
+ /* Notify waiting tasks that events are available */
+ if (waitqueue_active(&ep->wq))
+ wake_up(&ep->wq);
+ if (waitqueue_active(&ep->poll_wait))
+ pwake++;
+ }
+ write_unlock_irq(&ep->lock);
}
+ /* We have to call this outside the lock */
+ if (pwake)
+ ep_poll_safewake(ep, NULL, 0);
+
return 0;
}
@@ -1689,7 +1850,7 @@ static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct epitem *epi, *tmp;
- LLIST_HEAD(txlist);
+ LIST_HEAD(txlist);
poll_table pt;
int res = 0;
@@ -1704,18 +1865,19 @@ static int ep_send_events(struct eventpoll *ep,
init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx);
+ ep_start_scan(ep, &txlist);
- while (res < maxevents) {
+ /*
+ * We can loop without lock because we are passed a task private list.
+ * Items cannot vanish during the loop while we are holding ep->mtx.
+ */
+ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
struct wakeup_source *ws;
- struct llist_node *n;
__poll_t revents;
- n = llist_del_first(&ep->rdllist);
- if (!n)
+ if (res >= maxevents)
break;
- epi = llist_entry(n, struct epitem, rdllink);
-
/*
* Activate ep->ws before deactivating epi->ws to prevent
* triggering auto-suspend here (in case we reactive epi->ws
@@ -1732,30 +1894,21 @@ static int ep_send_events(struct eventpoll *ep,
__pm_relax(ws);
}
+ list_del_init(&epi->rdllink);
+
/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, we are holding ep->mtx,
* so no operations coming from userspace can change the item.
*/
revents = ep_item_poll(epi, &pt, 1);
- if (!revents) {
- init_llist_node(n);
-
- /*
- * Just in case epi becomes ready after ep_item_poll() above, but before
- * init_llist_node(). Make sure to add it to the ready list, otherwise an
- * event may be lost.
- */
- if (unlikely(ep_item_poll(epi, &pt, 1))) {
- ep_pm_stay_awake(epi);
- epitem_ready(epi);
- }
+ if (!revents)
continue;
- }
events = epoll_put_uevent(revents, epi->event.data, events);
if (!events) {
- llist_add(&epi->rdllink, &ep->rdllist);
+ list_add(&epi->rdllink, &txlist);
+ ep_pm_stay_awake(epi);
if (!res)
res = -EFAULT;
break;
@@ -1763,31 +1916,25 @@ static int ep_send_events(struct eventpoll *ep,
res++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
- __llist_add(n, &txlist);
- }
-
- llist_for_each_entry_safe(epi, tmp, txlist.first, rdllink) {
- init_llist_node(&epi->rdllink);
-
- if (!(epi->event.events & EPOLLET)) {
+ else if (!(epi->event.events & EPOLLET)) {
/*
- * If this file has been added with Level Trigger mode, we need to insert
- * back inside the ready list, so that the next call to epoll_wait() will
- * check again the events availability.
+ * If this file has been added with Level
+ * Trigger mode, we need to insert back inside
+ * the ready list, so that the next call to
+ * epoll_wait() will check again the events
+ * availability. At this point, no one can insert
+ * into ep->rdllist besides us. The epoll_ctl()
+ * callers are locked out by
+ * ep_send_events() holding "mtx" and the
+ * poll callback will queue them in ep->ovflist.
*/
+ list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
- epitem_ready(epi);
}
}
-
- __pm_relax(ep->ws);
+ ep_done_scan(ep, &txlist);
mutex_unlock(&ep->mtx);
- if (!llist_empty(&ep->rdllist)) {
- if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
- }
-
return res;
}
@@ -1880,6 +2027,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
+ lockdep_assert_irqs_enabled();
+
if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
slack = select_estimate_accuracy(timeout);
to = &expires;
@@ -1939,15 +2088,54 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- prepare_to_wait_exclusive(&ep->wq, &wait, TASK_INTERRUPTIBLE);
+ write_lock_irq(&ep->lock);
+ /*
+ * Barrierless variant, waitqueue_active() is called under
+ * the same lock on wakeup ep_poll_callback() side, so it
+ * is safe to avoid an explicit barrier.
+ */
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * Do the final check under the lock. ep_start/done_scan()
+ * plays with two lists (->rdllist and ->ovflist) and there
+ * is always a race when both lists are empty for a short
+ * period of time although events are pending, so the lock is
+ * important.
+ */
+ eavail = ep_events_available(ep);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, &wait);
+
+ write_unlock_irq(&ep->lock);
- if (!ep_events_available(ep))
+ if (!eavail)
timed_out = !ep_schedule_timeout(to) ||
!schedule_hrtimeout_range(to, slack,
HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
- finish_wait(&ep->wq, &wait);
- eavail = ep_events_available(ep);
+ /*
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
+ */
+ eavail = 1;
+
+ if (!list_empty_careful(&wait.entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue,
+ * it means that the thread was woken up after its
+ * timeout expired before it could reacquire the lock.
+ * Thus, when wait.entry is empty, it needs to harvest
+ * events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait.entry);
+ __remove_wait_queue(&ep->wq, &wait);
+ write_unlock_irq(&ep->lock);
+ }
}
}
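
The reverted eventpoll code pairs the read side of ep->lock (producers, i.e. the wait-queue callback) with the write side (consumers stealing the ready list in ep_start_scan()/ep_done_scan()). A hedged sketch of the producer-side pattern, using only names from the hunks above; the wrapper function itself is illustrative:

static void queue_ready_item_sketch(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;

	read_lock_irqsave(&ep->lock, flags);
	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		/* a scan is transferring events: park the item on ->ovflist */
		if (chain_epi_lockless(epi))
			ep_pm_stay_awake_rcu(epi);
	} else if (!ep_is_linked(epi)) {
		/* normal case: lockless tail-add to the ready list */
		if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
			ep_pm_stay_awake_rcu(epi);
	}
	read_unlock_irqrestore(&ep->lock, flags);
}
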
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index fafa86273f12..63d17cea2e95 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -8573,11 +8573,6 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
goto err_out;
}
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- opinfo_put(opinfo);
- ksmbd_fd_put(work, fp);
-
rsp->StructureSize = cpu_to_le16(24);
rsp->OplockLevel = rsp_oplevel;
rsp->Reserved = 0;
@@ -8585,16 +8580,15 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
rsp->VolatileFid = volatile_id;
rsp->PersistentFid = persistent_id;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
-
opinfo_put(opinfo);
ksmbd_fd_put(work, fp);
- smb2_set_err_rsp(work);
}
static int check_lease_state(struct lease *lease, __le32 req_state)
@@ -8724,11 +8718,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
}
lease_state = lease->state;
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- atomic_dec(&opinfo->breaking_cnt);
- wake_up_interruptible_all(&opinfo->oplock_brk);
- opinfo_put(opinfo);
rsp->StructureSize = cpu_to_le16(36);
rsp->Reserved = 0;
@@ -8737,16 +8726,16 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
rsp->LeaseState = lease_state;
rsp->LeaseDuration = 0;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
+ opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
atomic_dec(&opinfo->breaking_cnt);
wake_up_interruptible_all(&opinfo->oplock_brk);
-
opinfo_put(opinfo);
- smb2_set_err_rsp(work);
}
/**
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 64a428a06ace..c6cbe0d56e32 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -433,7 +433,8 @@ static void free_transport(struct smb_direct_transport *t)
if (t->qp) {
ib_drain_qp(t->qp);
ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs);
- ib_destroy_qp(t->qp);
+ t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
ksmbd_debug(RDMA, "drain the reassembly queue\n");
@@ -1940,8 +1941,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
return 0;
err:
if (t->qp) {
- ib_destroy_qp(t->qp);
t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
if (t->recv_cq) {
ib_destroy_cq(t->recv_cq);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 0f3aad12e495..d3437f6644e3 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -1282,6 +1282,7 @@ out1:
err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry);
if (err) {
+ mnt_drop_write(parent_path->mnt);
path_put(path);
path_put(parent_path);
}